Copilot commented on code in PR #658:
URL: https://github.com/apache/sedona-db/pull/658#discussion_r2849072502


##########
r/sedonadb/src/rust/src/context.rs:
##########
@@ -40,12 +44,26 @@ pub struct InternalContext {
 
 #[savvy]
 impl InternalContext {
-    pub fn new() -> Result<Self> {
+    pub fn new(option_keys: savvy::Sexp, option_values: savvy::Sexp) -> Result<Self> {
         let runtime = tokio::runtime::Builder::new_multi_thread()
             .enable_all()
             .build()?;
 
-        let inner = wait_for_future_captured_r(&runtime, SedonaContext::new_local_interactive())??;
+        let keys = savvy::StringSexp::try_from(option_keys)?;
+        let values = savvy::StringSexp::try_from(option_values)?;
+
+        let options: HashMap<String, String> = keys
+            .iter()
+            .zip(values.iter())
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();

Review Comment:
   `keys.iter().zip(values.iter())` will silently truncate to the shorter of 
the two vectors if their lengths differ, potentially dropping runtime options 
without any error. Add an explicit length check and return an error when 
`option_keys` and `option_values` lengths don't match.



##########
r/sedonadb/R/context.R:
##########
@@ -15,10 +15,110 @@
 # specific language governing permissions and limitations
 # under the License.
 
+#' Create a SedonaDB context
+#'
+#' Runtime options configure the execution environment. Use
+#' `global = TRUE` to configure the global context or use the
+#' returned object as a scoped context. A scoped context is
+#' recommended for programmatic usage as it prevents named
+#' views from interfering with each other.
+#'
+#' @param global Use TRUE to set options on the global context.
+#' @param memory_limit Maximum memory for query execution, as a
+#'   human-readable string (e.g., `"4gb"`, `"512m"`) or `NULL` for
+#'   unbounded (the default).
+#' @param temp_dir Directory for temporary/spill files, or `NULL` to
+#'   use the DataFusion default.
+#' @param memory_pool_type Memory pool type: `"greedy"` (default) or
+#'   `"fair"`. Only takes effect when `memory_limit` is set.
+#' @param unspillable_reserve_ratio Fraction of memory (0--1) reserved for
+#'   unspillable consumers. Only applies when `memory_pool_type` is
+#'   `"fair"`. Defaults to 0.2 when not explicitly set.
+#' @param ... Reserved for future options
+#'
+#' @returns The constructed context, invisibly.
+#' @export
+#'
+#' @examples
+#' sd_connect(memory_limit = "100mb", memory_pool_type = "fair")
+#'
+#'
+sd_connect <- function(
+  ...,
+  global = FALSE,
+  memory_limit = NULL,
+  temp_dir = NULL,
+  memory_pool_type = NULL,
+  unspillable_reserve_ratio = NULL
+) {
+  unsupported_options <- list(...)
+  if (length(unsupported_options) != 0) {
+    warning(
+      sprintf(
+        "Unrecognized options for sd_connect(): %s",
+        paste(names(unsupported_options), collapse = ", ")
+      )
+    )
+  }
+
+  options <- list()
+
+  if (!is.null(memory_limit)) {
+    stopifnot(is.character(memory_limit), length(memory_limit) == 1L)
+    options[["memory_limit"]] <- memory_limit
+  }
+
+  if (!is.null(temp_dir)) {
+    stopifnot(is.character(temp_dir), length(temp_dir) == 1L)
+    options[["temp_dir"]] <- temp_dir
+  }
+
+  if (!is.null(memory_pool_type)) {
+    memory_pool_type <- match.arg(memory_pool_type, c("greedy", "fair"))
+    options[["memory_pool_type"]] <- memory_pool_type
+  }
+
+  if (!is.null(unspillable_reserve_ratio)) {
+    stopifnot(
+      is.numeric(unspillable_reserve_ratio),
+      length(unspillable_reserve_ratio) == 1L,
+      unspillable_reserve_ratio >= 0,
+      unspillable_reserve_ratio <= 1
+    )
+    options[["unspillable_reserve_ratio"]] <- as.character(
+      unspillable_reserve_ratio
+    )
+  }
+
+  # Don't replace the global context
+  if (global && length(options) > 0 && !is.null(global_ctx$ctx)) {
+    warning(
+      "Cannot change runtime options after the context has been initialized. ",
+      "Set global options with sd_connect() before executing your first query ",
+      "or use a scoped context by passing global = FALSE"
+    )
+
+    return(invisible(global_ctx$ctx))
+  } else if (global && !is.null(global_ctx$ctx)) {
+    return(invisible(global_ctx$ctx))
+  }
+
+  keys <- as.character(names(options))
+  values <- as.character(options)

Review Comment:
   `values <- as.character(options)` relies on base R deparsing each list 
element into a string. This happens to work for a list of length-one character 
elements, but for non-character or non-scalar elements it silently produces 
deparsed representations (e.g., `"c(...)"`) rather than the intended values, 
which can break `sd_connect(memory_limit=..., ...)` in subtle ways. Convert the 
list to a character vector explicitly (e.g., via `unlist(options, use.names = 
FALSE)`) and ensure keys/values lengths match before calling into Rust.
   ```suggestion
     values <- as.character(unlist(options, use.names = FALSE))
     if (length(keys) != length(values)) {
       stop("Internal error: option keys/values length mismatch")
     }
   ```



##########
r/sedonadb/R/dataframe.R:
##########
@@ -109,8 +132,12 @@ sd_count <- function(.data) {
 #' sd_sql("SELECT 1 as one") |> sd_to_view("foofy")
 #' sd_sql("SELECT * FROM foofy")
 #'
-sd_to_view <- function(.data, table_ref, overwrite = FALSE) {
-  .data <- as_sedonadb_dataframe(.data)
+sd_to_view <- function(.data, table_ref, ctx = NULL, overwrite = FALSE) {

Review Comment:
   Adding `ctx` before `overwrite` changes the positional argument order of 
`sd_to_view()`. Existing calls like `sd_to_view(df, "name", TRUE)` (positional 
`overwrite`) will now pass `TRUE` as `ctx` and error. Consider keeping 
`overwrite` as the 3rd argument (and placing `ctx` after it) to avoid a 
breaking change.
   ```suggestion
   sd_to_view <- function(.data, table_ref, overwrite = FALSE, ctx = NULL) {
   ```



##########
r/sedonadb/R/context.R:
##########
@@ -88,28 +214,47 @@ sd_view <- function(table_ref) {
 #' to a Rust `FFI_ScalarUDF`, an example of which is available from the
#' [DataFusion Python documentation](https://github.com/apache/datafusion-python/blob/6f3b1cab75cfaa0cdf914f9b6fa023cb9afccd7d/examples/datafusion-ffi-example/src/scalar_udf.rs).
 #'
+#' @param ctx A SedonaDB context.
 #' @param udf An object of class 'datafusion_scalar_udf'
 #'
 #' @returns NULL, invisibly
 #' @export
 #'
 sd_register_udf <- function(udf) {
-  ctx <- ctx()
+  sd_ctx_register_udf(ctx(), udf)
+}
+
+#' @rdname sd_register_udf
+#' @export
+sd_ctx_register_udf <- function(ctx, udf) {

Review Comment:
   `sd_ctx_register_udf()` doesn't validate `ctx` with `check_ctx()`, unlike 
the other `sd_ctx_*` helpers. This makes errors from invalid inputs harder to 
diagnose and is inconsistent with the rest of the context API.
   ```suggestion
   sd_ctx_register_udf <- function(ctx, udf) {
     check_ctx(ctx)
   ```



##########
r/sedonadb/src/rust/src/dataframe.rs:
##########
@@ -286,13 +280,25 @@ impl InternalDataFrame {
             .clone();
         writer_options.inner.global = global_parquet_options;
 
+        // Apply user-specified options
+        for (k, v) in option_keys_strsxp.iter().zip(option_values_strsxp.iter()) {
+            writer_options
+                .set(k, v)
+                .map_err(|e| savvy::Error::new(format!("{e}")))?;
+        }

Review Comment:
   The loop over `option_keys_strsxp.iter().zip(option_values_strsxp.iter())` 
will silently ignore extra keys or values if the lengths differ. Even if the R 
wrapper usually validates this, it's safer to check equality here and return a 
clear error to prevent partially-applied writer options.



##########
r/sedonadb/R/dataframe.R:
##########
@@ -491,11 +529,14 @@ sd_summarize <- function(.data, ...) {
 sd_write_parquet <- function(
   .data,
   path,
+  options = NULL,
   partition_by = character(0),
   sort_by = character(0),

Review Comment:
   Adding `options` as the 3rd parameter changes the positional argument order 
of `sd_write_parquet()`. Any existing calls that passed `partition_by` 
positionally (e.g., `sd_write_parquet(df, path, "col")`) will now be 
interpreted as `options` and fail. To preserve backward compatibility, consider 
moving `options` after the existing arguments (or only accepting it via 
`...`/named argument).



##########
r/sedonadb/R/dataframe.R:
##########
@@ -507,22 +548,50 @@ sd_write_parquet <- function(
     single_file_output <- length(partition_by) == 0 && grepl("\\.parquet$", path)
   }
 
-  # Validate geoparquet_version
+  # Build the options list: start with user-provided options, then override
+  # with explicitly-specified arguments
+  if (is.null(options)) {
+    options <- list()
+  } else {
+    options <- as.list(options)
+  }
+
+  if (!is.null(max_row_group_size)) {
+    options[["max_row_group_size"]] <- as.character(as.integer(max_row_group_size))
+  }
+
+  if (!is.null(compression)) {
+    options[["compression"]] <- as.character(compression)
+  }
+
+  # Validate and apply geoparquet_version
   if (!is.null(geoparquet_version)) {
-    if (!geoparquet_version %in% c("1.0", "1.1")) {
-      stop("geoparquet_version must be '1.0' or '1.1'")
-    }
+    options[["geoparquet_version"]] <- as.character(geoparquet_version)
+  }
+

Review Comment:
   `overwrite_bbox_columns` is coerced to a lowercased string via 
`tolower(as.character(...))` without checking it's a single logical. Passing 
non-logical values (e.g., "yes", 1, NA) will yield strings that may not parse 
as booleans in the underlying option setter. Validate 
`is.logical(overwrite_bbox_columns)` and `length(...) == 1L` (and not `NA`) 
before converting.
   ```suggestion
   
  if (!is.logical(overwrite_bbox_columns) || length(overwrite_bbox_columns) != 1L || is.na(overwrite_bbox_columns)) {
    stop("`overwrite_bbox_columns` must be a single non-missing logical value")
     }
   ```



##########
r/sedonadb/R/dataframe.R:
##########
@@ -507,22 +548,50 @@ sd_write_parquet <- function(
     single_file_output <- length(partition_by) == 0 && grepl("\\.parquet$", path)
   }
 
-  # Validate geoparquet_version
+  # Build the options list: start with user-provided options, then override
+  # with explicitly-specified arguments
+  if (is.null(options)) {
+    options <- list()
+  } else {
+    options <- as.list(options)
+  }
+
+  if (!is.null(max_row_group_size)) {
+    options[["max_row_group_size"]] <- as.character(as.integer(max_row_group_size))

Review Comment:
   `max_row_group_size` is coerced with `as.integer()` without validating 
type/range. If the input is non-numeric, fractional, `NA`, or <= 0, it will 
silently truncate or become `NA` and then be passed to Rust as a string, 
producing confusing downstream errors. Add explicit validation (numeric scalar, 
finite, >= 1, integer-ish) before populating the option.
   ```suggestion
       # Validate max_row_group_size: numeric scalar, finite, >= 1, integer-ish
       if (!is.numeric(max_row_group_size) ||
           length(max_row_group_size) != 1L ||
           !is.finite(max_row_group_size)) {
         stop("max_row_group_size must be a finite numeric value of length 1")
       }
       if (max_row_group_size < 1) {
         stop("max_row_group_size must be greater than or equal to 1")
       }
      if (abs(max_row_group_size - round(max_row_group_size)) > sqrt(.Machine$double.eps)) {
         stop("max_row_group_size must be an integer value")
       }
   
       max_row_group_size_int <- as.integer(round(max_row_group_size))
       options[["max_row_group_size"]] <- as.character(max_row_group_size_int)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to