GitHub user e-kotov added a comment to the discussion: Observations from R and Python benchmarks: performance bottlenecks and optimization ideas for sedona-db
While sedonadb-r does not have a data reader, I implemented a DuckDB-based reader that passes the data over to SedonaDB via Arrow in https://github.com/e-kotov/sx

A script for the benchmark:

```r
install.packages(
  'sedonadb',
  repos = c('https://apache.r-universe.dev', 'https://cloud.r-project.org')
)
pak::pak("e-kotov/sx")

library(sx)
library(sf)
library(dplyr)
library(microbenchmark)
library(spData)
library(sedonadb)

# --- Data Preparation ---
message("Preparing data...")
data(nz, package = "spData")
nz <- st_make_valid(nz)

# Check/Transform CRS (EPSG:2193)
if (st_crs(nz)$epsg != 2193) {
  nz <- st_transform(nz, 2193)
}

# Create temp files using our package helper
regions_path <- sx:::sx_create_temp_spatial_file(nz, ext = "gpkg")

n_points <- 100000
message("Generating ", n_points, " random points in NZ bbox...")
set.seed(42)
bbox_poly <- st_as_sfc(st_bbox(nz))
points_sfc <- st_sample(bbox_poly, size = n_points)
points_sf <- st_sf(geometry = points_sfc)
if (st_crs(points_sf)$epsg != 2193) {
  st_crs(points_sf) <- 2193
}
points_path <- sx:::sx_create_temp_spatial_file(points_sf, ext = "gpkg")

message("Starting SX Materialization Benchmarks...")

# Helper to log results
log_result <- function(operation, benchmark) {
  mean_sec <- mean(benchmark$time) / 1e9
  ops_per_sec <- 1 / mean_sec
  cat(sprintf("sx-package,R,%s,%.2f\n", operation, ops_per_sec))
}

# 1. Lazy Load (Disk -> Lazy View)
# sx_read stays lazy by default
message("\n--- Loading Data (Lazy) ---")
points <- sx_read(points_path, verbosity = "quiet")
regions <- sx_read(regions_path, view_name = "regions", verbosity = "quiet")

# 2. Buffer (1km)
message("\n--- Benchmark: Buffer (1km) ---")

# Scenario 1: Materialize to Sedona Table (Stay in DB)
message("Scenario 1: Materialize to SedonaDB Table")
bench_buffer_table <- microbenchmark(
  buffer_table = {
    x <- sx_buffer(points, dist = 1000, verbosity = "quiet") |>
      sedonadb::sd_compute() # force materialization
  },
  times = 5
)
print(bench_buffer_table)
log_result("buffer_table", bench_buffer_table)

# Scenario 2: Materialize to R (Collect to sf)
message("Scenario 2: Materialize to R (sf)")
bench_buffer_collect <- microbenchmark(
  buffer_collect = {
    x <- sx_buffer("points", dist = 1000, verbosity = "quiet") |>
      sx_collect(verbosity = "quiet")
  },
  times = 5
)
print(bench_buffer_collect)
log_result("buffer_collect", bench_buffer_collect)

# 3. Spatial Join (Intersects)
message("\n--- Benchmark: Spatial Join (Intersects) ---")

# Scenario 1: Materialize to Sedona Table
message("Scenario 1: Materialize to SedonaDB Table")
bench_sjoin_table <- microbenchmark(
  sjoin_table = {
    sx_join(
      points,
      regions,
      join = "intersects",
      left = TRUE,
      verbosity = "quiet"
    ) |>
      select(geom, Name) |>
      sedonadb::sd_compute() # force materialization
  },
  times = 5
)
print(bench_sjoin_table)
log_result("sjoin_table", bench_sjoin_table)

# Scenario 2: Materialize to R (Collect to sf)
message("Scenario 2: Materialize to R (sf)")
bench_sjoin_collect <- microbenchmark(
  sjoin_collect = {
    sx_join(
      points,
      regions,
      join = "intersects",
      left = TRUE,
      verbosity = "quiet"
    ) |>
      select(geom, Name) |>
      sx_collect(verbosity = "quiet")
  },
  times = 5
)
print(bench_sjoin_collect)
log_result("sjoin_collect", bench_sjoin_collect)

message("\nSX Materialization Benchmarks Complete.")
```

GitHub link: https://github.com/apache/sedona/discussions/2576#discussioncomment-15599883

----
This is an automatically sent email for [email protected].
To unsubscribe, please send an email to: [email protected]
