Copilot commented on code in PR #670: URL: https://github.com/apache/sedona-db/pull/670#discussion_r2866992758
########## r/sedonadb/tests/testthat/test-datasource.R: ########## @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("sd_read_sf() works for layers with named geometry columns", { + skip_if_not_installed("sf") + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + + from_stream <- sf::st_as_sf(sd_read_sf(nc_gpkg)) + from_sf <- sf::st_read(nc_gpkg, quiet = TRUE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works for layers with unnamed geometry columns", { + skip_if_not_installed("sf") + + nc_shp <- system.file("shape/nc.shp", package = "sf") + + from_stream <- sf::st_as_sf(sd_read_sf(nc_shp)) + from_sf <- sf::st_read(nc_shp, quiet = TRUE, promote_to_multi = FALSE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # The from_stream version has a geometry column named "wkb_geometry" but + # sf renames this internally to "geometry" + expect_true("wkb_geometry" %in% names(from_stream)) + colnames(from_stream)[colnames(from_stream) == "wkb_geometry"] <- "geometry" + sf::st_geometry(from_stream) <- "geometry" + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works for database dsns / non-default layers", { + skip_if_not_installed("sf") + + # Can be tested using docker compose up with + # postgresql://localhost:5432/postgres?user=postgres&password=password + test_uri <- Sys.getenv("SEDONADB_POSTGRESQL_TEST_URI", unset = "") + if (identical(test_uri, "")) { + skip("SEDONADB_POSTGRESQL_TEST_URI is not set") + } + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + sf::st_write( + sf::st_read(nc_gpkg, quiet = TRUE), + test_uri, + "test_sf_nc", + append = FALSE, + driver = "PostgreSQL", + quiet = TRUE + ) + + from_stream <- sf::st_as_sf(sd_read_sf(test_uri, "test_sf_nc")) + from_sf <- sf::st_read(test_uri, "test_sf_nc", quiet = TRUE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works with filter", { + skip_if_not_installed("sf") + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + filter <- wk::rct(-77.901, 36.162, -77.075, 36.556, crs = sf::st_crs("NAD27")) + + from_stream <- sf::st_as_sf(sd_read_sf(nc_gpkg, filter = filter)) + from_sf <- sf::st_read(nc_gpkg, quiet = TRUE, wkt_filter = wk::as_wkt(filter)) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) + + # Check for error if filtered with an invalid CRS + wk::wk_crs(filter) <- NULL + expect_snapshot_error(sd_read_sf(nc_gpkg, filter = filter)) +}) + +test_that("sd_read_sf() works for zipped dsns", { + skip_if_not_installed("sf") + + fgb <- system.file("files/natural-earth_cities.fgb", package = "sedonadb") + fgb_zip <- paste0(fgb, ".zip") + + from_stream_fgb <- sd_read_sf(fgb) |> sf::st_as_sf() + from_stream_fgb_zip <- sd_read_sf(fgb_zip) |> sf::st_as_sf() + expect_identical(from_stream_fgb_zip, from_stream_fgb) +}) + +test_that("sd_read_sf() works for URL dsns", { + skip_on_cran() + skip_if_not_installed("sf") + + # nolint start: line_length_linter + url <- "https://github.com/geoarrow/geoarrow-data/releases/download/v0.2.0/ns-water_water-point.fgb" + # nolint end Review Comment: This test hits GitHub URLs directly and only skips on CRAN. That can still be flaky in CI/offline environments. Consider adding `skip_if_offline()` (testthat) or an equivalent network skip to avoid non-deterministic failures. ########## r/sedonadb/R/datasource.R: ########## @@ -0,0 +1,227 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read GDAL/OGR via the sf package +#' +#' Uses the ArrowArrayStream interface to GDAL exposed via the sf package +#' to read GDAL/OGR-based data sources. +#' +#' @param ctx A SedonaDB context created using [sd_connect()]. +#' @param dsn,layer Description of datasource and layer. See [sf::read_sf()] +#' for details. +#' @param ... Currently unused and must be empty +#' @param query A SQL query to pass on to GDAL/OGR. +#' @param options A character vector with layer open options in the +#' form "KEY=VALUE". +#' @param drivers A list of drivers to try if the dsn cannot be guessed. +#' @param filter A spatial object that may be used to filter while reading. +#' In the future SedonaDB will automatically calculate this value based on +#' the query. May be any spatial object that can be converted to WKT via +#' [wk::as_wkt()]. This filter's CRS must match that of the data. +#' @param fid_column_name An optional name for the feature id (FID) column. +#' @param lazy Use `TRUE` to stream the data from the source rather than collect +#' first. This can be faster for large data sources but can also be confusing +#' because the data may only be scanned exactly once. +#' +#' @returns A SedonaDB DataFrame. +#' @export +#' +#' @examples +#' nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") +#' sd_read_sf(nc_gpkg) +#' +sd_read_sf <- function( + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL, + lazy = FALSE +) { + sd_ctx_read_sf( + ctx(), + dsn = dsn, + layer = layer, + ..., + query = query, + options = options, + drivers = drivers, + filter = filter, + fid_column_name = fid_column_name, + lazy = lazy + ) +} + +#' @rdname sd_read_sf +#' @export +sd_ctx_read_sf <- function( + ctx, + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL, + lazy = FALSE +) { + stream <- read_sf_stream( + dsn = dsn, + layer = layer, + ..., + query = query, + options = options, + drivers = drivers, + filter = filter, + fid_column_name = fid_column_name + ) + + df <- ctx$data_frame_from_array_stream(stream, collect_now = !lazy) + new_sedonadb_dataframe(ctx, df) +} + + +read_sf_stream <- function( + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL +) { + check_dots_empty(..., label = "sd_read_sf_stream()") + + if (is.null(layer)) { + layer <- character(0) + } else { + layer <- enc2utf8(layer) + } + + if (nchar(dsn) == 0) { + stop("Expected non-empty value for dsn") + } + Review Comment: `if (nchar(dsn) == 0)` will error when `dsn` has length > 1 (condition has length > 1). Since GDAL DSNs are expected to be scalar, validate `length(dsn) == 1` (and type) before calling `nchar()`/`file.exists()` and pass a clear error when it isn’t. ########## r/sedonadb/R/datasource.R: ########## @@ -0,0 +1,227 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#' Read GDAL/OGR via the sf package +#' +#' Uses the ArrowArrayStream interface to GDAL exposed via the sf package +#' to read GDAL/OGR-based data sources. +#' +#' @param ctx A SedonaDB context created using [sd_connect()]. +#' @param dsn,layer Description of datasource and layer. See [sf::read_sf()] +#' for details. +#' @param ... Currently unused and must be empty +#' @param query A SQL query to pass on to GDAL/OGR. +#' @param options A character vector with layer open options in the +#' form "KEY=VALUE". +#' @param drivers A list of drivers to try if the dsn cannot be guessed. +#' @param filter A spatial object that may be used to filter while reading. +#' In the future SedonaDB will automatically calculate this value based on +#' the query. May be any spatial object that can be converted to WKT via +#' [wk::as_wkt()]. This filter's CRS must match that of the data. +#' @param fid_column_name An optional name for the feature id (FID) column. +#' @param lazy Use `TRUE` to stream the data from the source rather than collect +#' first. This can be faster for large data sources but can also be confusing +#' because the data may only be scanned exactly once. +#' +#' @returns A SedonaDB DataFrame. +#' @export +#' +#' @examples +#' nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") +#' sd_read_sf(nc_gpkg) +#' +sd_read_sf <- function( + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL, + lazy = FALSE +) { + sd_ctx_read_sf( + ctx(), + dsn = dsn, + layer = layer, + ..., + query = query, + options = options, + drivers = drivers, + filter = filter, + fid_column_name = fid_column_name, + lazy = lazy + ) +} + +#' @rdname sd_read_sf +#' @export +sd_ctx_read_sf <- function( + ctx, + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL, + lazy = FALSE +) { + stream <- read_sf_stream( + dsn = dsn, + layer = layer, + ..., + query = query, + options = options, + drivers = drivers, + filter = filter, + fid_column_name = fid_column_name + ) + + df <- ctx$data_frame_from_array_stream(stream, collect_now = !lazy) + new_sedonadb_dataframe(ctx, df) +} + + +read_sf_stream <- function( + dsn, + layer = NULL, + ..., + query = NA, + options = NULL, + drivers = NULL, + filter = NULL, + fid_column_name = NULL +) { + check_dots_empty(..., label = "sd_read_sf_stream()") + + if (is.null(layer)) { + layer <- character(0) + } else { + layer <- enc2utf8(layer) + } + + if (nchar(dsn) == 0) { + stop("Expected non-empty value for dsn") + } + + dsn_exists <- file.exists(dsn) + + # A heuristic to catch common database DSNs so that we don't try to normalize + # them as file paths + dsn_isdb <- grepl("^(pg|mssql|pgeo|odbc|postgresql):", tolower(dsn)) + dsn_is_http <- grepl("^https://", dsn) Review Comment: HTTP URLs are only detected with `grepl("^https://", dsn)`, so `http://...` inputs won’t be prefixed with `/vsicurl/` (unlike the Python bindings which handle both http and https). Consider matching `^https?://` so plain HTTP URLs work too. ```suggestion dsn_is_http <- grepl("^https?://", dsn) ``` ########## r/sedonadb/man/apply_crses_to_sf_stream.Rd: ########## @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/000-wrappers.R +\name{apply_crses_to_sf_stream} +\alias{apply_crses_to_sf_stream} +\title{This is a workaround for the current availability of the GDAL stream interface +as exposed by the sf package. With} +\usage{ +apply_crses_to_sf_stream( + stream_in_xptr, + geometry_column_names, + geometry_column_crses, + stream_out_xptr +) +} +\description{ +This is a workaround for the current availability of the GDAL stream interface +as exposed by the sf package. With +} Review Comment: The generated Rd has an incomplete title/description (both end with “With”), which will produce confusing help output. Either remove this man page if the helper is internal-only, or complete the roxygen so the title/description are full sentences. ########## r/sedonadb/tests/testthat/test-datasource.R: ########## @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +test_that("sd_read_sf() works for layers with named geometry columns", { + skip_if_not_installed("sf") + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + + from_stream <- sf::st_as_sf(sd_read_sf(nc_gpkg)) + from_sf <- sf::st_read(nc_gpkg, quiet = TRUE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works for layers with unnamed geometry columns", { + skip_if_not_installed("sf") + + nc_shp <- system.file("shape/nc.shp", package = "sf") + + from_stream <- sf::st_as_sf(sd_read_sf(nc_shp)) + from_sf <- sf::st_read(nc_shp, quiet = TRUE, promote_to_multi = FALSE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # The from_stream version has a geometry column named "wkb_geometry" but + # sf renames this internally to "geometry" + expect_true("wkb_geometry" %in% names(from_stream)) + colnames(from_stream)[colnames(from_stream) == "wkb_geometry"] <- "geometry" + sf::st_geometry(from_stream) <- "geometry" + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works for database dsns / non-default layers", { + skip_if_not_installed("sf") + + # Can be tested using docker compose up with + # postgresql://localhost:5432/postgres?user=postgres&password=password + test_uri <- Sys.getenv("SEDONADB_POSTGRESQL_TEST_URI", unset = "") + if (identical(test_uri, "")) { + skip("SEDONADB_POSTGRESQL_TEST_URI is not set") + } + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + sf::st_write( + sf::st_read(nc_gpkg, quiet = TRUE), + test_uri, + "test_sf_nc", + append = FALSE, + driver = "PostgreSQL", + quiet = TRUE + ) + + from_stream <- sf::st_as_sf(sd_read_sf(test_uri, "test_sf_nc")) + from_sf <- sf::st_read(test_uri, "test_sf_nc", quiet = TRUE) + + # Expect identical CRS + expect_true(sf::st_crs(from_stream) == sf::st_crs(from_sf)) + + # Expect identical content without CRS + expect_equal( + from_stream |> sf::st_set_crs(NA) |> as.data.frame(), + from_sf |> sf::st_set_crs(NA) |> as.data.frame() + ) +}) + +test_that("sd_read_sf() works with filter", { + skip_if_not_installed("sf") + + nc_gpkg <- system.file("gpkg/nc.gpkg", package = "sf") + filter <- wk::rct(-77.901, 36.162, -77.075, 36.556, crs = sf::st_crs("NAD27")) + + from_stream <- sf::st_as_sf(sd_read_sf(nc_gpkg, filter = filter)) Review Comment: These tests call `wk::...` (and `sd_read_sf()` itself uses `wk::`), but the only dependency guard is `skip_if_not_installed("sf")`. Since `wk` is in Suggests, this will fail on environments without wk; add `skip_if_not_installed("wk")` in the relevant tests (or a shared helper at the top of the file). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
