This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new f633cd64 feat(r/sedonadb): Implement `dplyr::select()` for
sedonadb_dataframe (#419)
f633cd64 is described below
commit f633cd64cdbb56828982895b6d5661cb339dd2f9
Author: Dewey Dunnington <[email protected]>
AuthorDate: Tue Dec 9 10:53:49 2025 -0600
feat(r/sedonadb): Implement `dplyr::select()` for sedonadb_dataframe (#419)
---
r/sedonadb/DESCRIPTION | 3 +++
r/sedonadb/R/000-wrappers.R | 7 ++++++
r/sedonadb/R/pkg-dplyr.R | 30 ++++++++++++++++++++++++
r/sedonadb/R/zzz.R | 3 +++
r/sedonadb/src/init.c | 10 ++++++++
r/sedonadb/src/rust/api.h | 2 ++
r/sedonadb/src/rust/src/dataframe.rs | 26 +++++++++++++++++----
r/sedonadb/tests/testthat/test-pkg-dplyr.R | 37 ++++++++++++++++++++++++++++++
8 files changed, 113 insertions(+), 5 deletions(-)
diff --git a/r/sedonadb/DESCRIPTION b/r/sedonadb/DESCRIPTION
index 1f941adf..4876e3be 100644
--- a/r/sedonadb/DESCRIPTION
+++ b/r/sedonadb/DESCRIPTION
@@ -13,9 +13,12 @@ SystemRequirements: Cargo (Rust's package manager), rustc
Depends: R (>= 4.1.0)
Suggests:
adbcdrivermanager,
+ dplyr,
rlang,
+ tibble,
sf,
testthat (>= 3.0.0),
+ tidyselect,
withr,
wk
Imports:
diff --git a/r/sedonadb/R/000-wrappers.R b/r/sedonadb/R/000-wrappers.R
index 72e50d28..df1f61fd 100644
--- a/r/sedonadb/R/000-wrappers.R
+++ b/r/sedonadb/R/000-wrappers.R
@@ -186,6 +186,12 @@ class(`InternalContext`) <-
c("sedonadb::InternalContext__bundle", "savvy_sedona
}
}
+# Builds the `select_indices` method for one InternalDataFrame pointer: the
+# returned closure forwards `self`, `names` and `indices` to the native
+# routine and wraps whatever it returns as an InternalDataFrame.
+`InternalDataFrame_select_indices` <- function(self) {
+  function(`names`, `indices`) {
+    res <- .Call(
+      savvy_InternalDataFrame_select_indices__impl,
+      `self`, `names`, `indices`
+    )
+    .savvy_wrap_InternalDataFrame(res)
+  }
+}
+
`InternalDataFrame_show` <- function(self) {
function(`ctx`, `width_chars`, `ascii`, `limit` = NULL) {
`ctx` <- .savvy_extract_ptr(`ctx`, "sedonadb::InternalContext")
@@ -233,6 +239,7 @@ class(`InternalContext`) <-
c("sedonadb::InternalContext__bundle", "savvy_sedona
e$`count` <- `InternalDataFrame_count`(ptr)
e$`limit` <- `InternalDataFrame_limit`(ptr)
e$`primary_geometry_column_index` <-
`InternalDataFrame_primary_geometry_column_index`(ptr)
+ e$`select_indices` <- `InternalDataFrame_select_indices`(ptr)
e$`show` <- `InternalDataFrame_show`(ptr)
e$`to_arrow_schema` <- `InternalDataFrame_to_arrow_schema`(ptr)
e$`to_arrow_stream` <- `InternalDataFrame_to_arrow_stream`(ptr)
diff --git a/r/sedonadb/R/pkg-dplyr.R b/r/sedonadb/R/pkg-dplyr.R
new file mode 100644
index 00000000..817dca65
--- /dev/null
+++ b/r/sedonadb/R/pkg-dplyr.R
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# collect() method for sedonadb_dataframe: runs `x` through sd_collect() and
+# returns the result converted to a tibble.
+collect.sedonadb_dataframe <- function(x, ...) {
+  # No extra arguments are supported; error if anything was passed via `...`
+  rlang::check_dots_empty()
+
+  collected <- sd_collect(x)
+  tibble::as_tibble(collected)
+}
+
+# select() method for sedonadb_dataframe.
+#
+# The selection in `...` is resolved by tidyselect against the ptype inferred
+# from the data frame's schema; the resulting positions and output names are
+# then handed to the internal `select_indices()` method.
+select.sedonadb_dataframe <- function(.data, ...) {
+  # tidyselect needs a data-frame-like object to resolve names and positions
+  ptype <- nanoarrow::infer_nanoarrow_ptype(
+    nanoarrow::infer_nanoarrow_schema(.data)
+  )
+
+  # Named integer vector: names are the output column names, values are the
+  # positions of the selected columns
+  selection <- tidyselect::eval_select(rlang::expr(c(...)), data = ptype)
+
+  # The internal method takes positions shifted down by one
+  selected <- .data$df$select_indices(names(selection), selection - 1L)
+  new_sedonadb_dataframe(.data$ctx, selected)
+}
diff --git a/r/sedonadb/R/zzz.R b/r/sedonadb/R/zzz.R
index e286d550..9f9720cc 100644
--- a/r/sedonadb/R/zzz.R
+++ b/r/sedonadb/R/zzz.R
@@ -21,6 +21,9 @@
s3_register("sf::st_as_sf", "sedonadb_dataframe")
+ s3_register("dplyr::collect", "sedonadb_dataframe")
+ s3_register("dplyr::select", "sedonadb_dataframe")
+
# Inject what we need to reduce the Rust code to a simple Rf_eval()
ns <- asNamespace("sedonadb")
call <- call("check_interrupts")
diff --git a/r/sedonadb/src/init.c b/r/sedonadb/src/init.c
index f72a3cab..8405b2cc 100644
--- a/r/sedonadb/src/init.c
+++ b/r/sedonadb/src/init.c
@@ -156,6 +156,14 @@ SEXP
savvy_InternalDataFrame_primary_geometry_column_index__impl(SEXP self__) {
return handle_result(res);
}
+/* .Call entry point for InternalDataFrame$select_indices(): forwards the
+ * three arguments to the FFI function and passes its result through
+ * handle_result(). */
+SEXP savvy_InternalDataFrame_select_indices__impl(SEXP self__,
+                                                  SEXP c_arg__names,
+                                                  SEXP c_arg__indices) {
+  return handle_result(savvy_InternalDataFrame_select_indices__ffi(
+      self__, c_arg__names, c_arg__indices));
+}
+
SEXP savvy_InternalDataFrame_show__impl(SEXP self__, SEXP c_arg__ctx,
SEXP c_arg__width_chars,
SEXP c_arg__ascii, SEXP c_arg__limit) {
@@ -236,6 +244,8 @@ static const R_CallMethodDef CallEntries[] = {
(DL_FUNC)&savvy_InternalDataFrame_limit__impl, 2},
{"savvy_InternalDataFrame_primary_geometry_column_index__impl",
(DL_FUNC)&savvy_InternalDataFrame_primary_geometry_column_index__impl, 1},
+ {"savvy_InternalDataFrame_select_indices__impl",
+ (DL_FUNC)&savvy_InternalDataFrame_select_indices__impl, 3},
{"savvy_InternalDataFrame_show__impl",
(DL_FUNC)&savvy_InternalDataFrame_show__impl, 5},
{"savvy_InternalDataFrame_to_arrow_schema__impl",
diff --git a/r/sedonadb/src/rust/api.h b/r/sedonadb/src/rust/api.h
index 268c046a..201039e1 100644
--- a/r/sedonadb/src/rust/api.h
+++ b/r/sedonadb/src/rust/api.h
@@ -44,6 +44,8 @@ SEXP savvy_InternalDataFrame_compute__ffi(SEXP self__, SEXP
c_arg__ctx);
SEXP savvy_InternalDataFrame_count__ffi(SEXP self__);
SEXP savvy_InternalDataFrame_limit__ffi(SEXP self__, SEXP c_arg__n);
SEXP savvy_InternalDataFrame_primary_geometry_column_index__ffi(SEXP self__);
+SEXP savvy_InternalDataFrame_select_indices__ffi(SEXP self__, SEXP
c_arg__names,
+ SEXP c_arg__indices);
SEXP savvy_InternalDataFrame_show__ffi(SEXP self__, SEXP c_arg__ctx,
SEXP c_arg__width_chars,
SEXP c_arg__ascii, SEXP c_arg__limit);
diff --git a/r/sedonadb/src/rust/src/dataframe.rs
b/r/sedonadb/src/rust/src/dataframe.rs
index 6f6f98c2..45a3cf97 100644
--- a/r/sedonadb/src/rust/src/dataframe.rs
+++ b/r/sedonadb/src/rust/src/dataframe.rs
@@ -14,23 +14,22 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use std::ptr::swap_nonoverlapping;
-use std::sync::Arc;
use arrow_array::ffi::FFI_ArrowSchema;
use arrow_array::ffi_stream::FFI_ArrowArrayStream;
use arrow_array::{RecordBatchIterator, RecordBatchReader};
use datafusion::catalog::MemTable;
-use datafusion::{logical_expr::SortExpr, prelude::DataFrame};
+use datafusion::prelude::DataFrame;
use datafusion_common::Column;
-use datafusion_expr::Expr;
+use datafusion_expr::{select_expr::SelectExpr, Expr, SortExpr};
use datafusion_ffi::table_provider::FFI_TableProvider;
-use savvy::{savvy, savvy_err, IntoExtPtrSexp, Result};
+use savvy::{savvy, savvy_err, sexp, IntoExtPtrSexp, Result};
use sedona::context::{SedonaDataFrame, SedonaWriteOptions};
use sedona::reader::SedonaStreamReader;
use sedona::show::{DisplayMode, DisplayTableOptions};
use sedona_geoparquet::options::{GeoParquetVersion, TableGeoParquetOptions};
use sedona_schema::schema::SedonaSchema;
+use std::{iter::zip, ptr::swap_nonoverlapping, sync::Arc};
use tokio::runtime::Runtime;
use crate::context::InternalContext;
@@ -292,4 +291,21 @@ impl InternalDataFrame {
Ok(())
}
+
+    /// Select (and optionally rename) columns by zero-based position.
+    ///
+    /// `names` holds the output name for each selected column and `indices`
+    /// holds the matching zero-based position in this data frame's schema.
+    ///
+    /// Returns an error if `names` is not a character vector, `indices` is
+    /// not an integer vector, the two differ in length, or any index is
+    /// negative or not smaller than the number of fields in the schema.
+    fn select_indices(
+        &self,
+        names: sexp::Sexp,
+        indices: sexp::Sexp,
+    ) -> Result<InternalDataFrame> {
+        let names_strsxp = savvy::StringSexp::try_from(names)?;
+        let indices_intsxp = savvy::IntegerSexp::try_from(indices)?;
+
+        // zip() stops at the shorter input, so a length mismatch would
+        // otherwise silently drop part of the selection.
+        if names_strsxp.len() != indices_intsxp.len() {
+            return Err(savvy_err!(
+                "names and indices must have the same length ({} != {})",
+                names_strsxp.len(),
+                indices_intsxp.len()
+            ));
+        }
+
+        let df_schema = self.inner.schema();
+        let n_fields = df_schema.fields().len();
+        let exprs = zip(names_strsxp.iter(), indices_intsxp.iter())
+            .map(|(name, index)| {
+                // Negative values fail this conversion and become an error
+                let i = usize::try_from(*index)?;
+
+                // NOTE(review): qualified_field() presumably indexes the
+                // schema directly and panics when out of range; report a
+                // regular error instead.
+                if i >= n_fields {
+                    return Err(savvy_err!(
+                        "Column index {} is out of range for a schema with {} field(s)",
+                        i,
+                        n_fields
+                    ));
+                }
+
+                let (table_ref, field) = df_schema.qualified_field(i);
+                let column = Column::new(table_ref.cloned(), field.name());
+                Ok(SelectExpr::Expression(Expr::Column(column).alias(name)))
+            })
+            .collect::<Result<Vec<_>>>()?;
+
+        let inner = self.inner.clone().select(exprs)?;
+        Ok(new_data_frame(inner, self.runtime.clone()))
+    }
}
diff --git a/r/sedonadb/tests/testthat/test-pkg-dplyr.R
b/r/sedonadb/tests/testthat/test-pkg-dplyr.R
new file mode 100644
index 00000000..399a718a
--- /dev/null
+++ b/r/sedonadb/tests/testthat/test-pkg-dplyr.R
@@ -0,0 +1,37 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+test_that("select() works for sedonadb_dataframe", {
+  skip_if_not_installed("dplyr")
+
+  df <- sd_sql("SELECT 1 as one, 'two' as two, 3.0 as \"THREE\"")
+
+  # Selecting by position keeps the source column names
+  by_position <- dplyr::collect(dplyr::select(df, 2:3))
+  expect_identical(
+    by_position,
+    tibble::tibble(two = "two", THREE = 3.0)
+  )
+
+  # Selecting by name can rename and reorder in one step
+  renamed_reordered <- dplyr::collect(
+    dplyr::select(df, three_renamed = THREE, one)
+  )
+  expect_identical(
+    renamed_reordered,
+    tibble::tibble(three_renamed = 3.0, one = 1)
+  )
+
+  # A rename that only changes case yields the new name
+  case_renamed <- dplyr::collect(dplyr::select(df, TWO = two))
+  expect_identical(
+    case_renamed,
+    tibble::tibble(TWO = "two")
+  )
+})