(sedona-db) branch main updated: fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)

paleolimbot Fri, 16 Jan 2026 07:23:58 -0800

This is an automated email from the ASF dual-hosted git repository.

paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git



The following commit(s) were added to refs/heads/main by this push:
     new 2535b299 fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)
2535b299 is described below

commit 2535b299c27066e8fca0c0c16423fc291dbf23e8
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Jan 16 09:23:47 2026 -0600

    fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)
    
    Co-authored-by: Copilot <[email protected]>
---
 python/sedonadb/tests/test_sjoin.py             | 54 ++++++++++++++++++++++++-
 rust/sedona/src/record_batch_reader_provider.rs | 23 ++++++++++-
 2 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/python/sedonadb/tests/test_sjoin.py b/python/sedonadb/tests/test_sjoin.py
index 9169fbfb..e80c55fe 100644
--- a/python/sedonadb/tests/test_sjoin.py
+++ b/python/sedonadb/tests/test_sjoin.py
@@ -14,9 +14,16 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import pytest
+
 import json
+import warnings
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import pytest
 from sedonadb.testing import PostGIS, SedonaDB
+from shapely.geometry import Point
 
 
 @pytest.mark.parametrize(
@@ -394,3 +401,48 @@ def test_non_optimizable_subquery():
         sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
         assert len(sedonadb_results) > 0
         eng_postgis.assert_query_result(sql, sedonadb_results)
+
+
+def test_spatial_join_with_pandas_metadata(con):
+    # Previous versions of SedonaDB failed to execute this because of a mismatched
+    # schema. Attempts to simplify this reproducer weren't able to recreate the
+    # initial error (PhysicalOptimizer rule 'join_selection' failed).
+    # https://github.com/apache/sedona-db/issues/477
+
+    # 1. Generate Data
+    n_points = 1000
+    n_polys = 10
+
+    # Points
+    rng = np.random.Generator(np.random.MT19937(49791))
+    lons = rng.uniform(-6, 2, n_points)
+    lats = rng.uniform(50, 59, n_points)
+    pts_df = pd.DataFrame(
+        {"idx": range(n_points), "geometry": [Point(x, y) for x, y in zip(lons, lats)]}
+    )
+    pts_gdf = gpd.GeoDataFrame(pts_df, crs="EPSG:4326")
+
+    # Polygons (Centers buffered)
+    plons = rng.uniform(-6, 2, n_polys)
+    plats = rng.uniform(50, 59, n_polys)
+    poly_centers = gpd.GeoDataFrame(
+        {"geometry": [Point(x, y) for x, y in zip(plons, plats)]}, crs="EPSG:4326"
+    )
+    # Simple buffer in degrees (test data so we don't need the GeoPandas warning here)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        polys_gdf = poly_centers.buffer(0.1).to_frame(name="geometry")
+
+    # 2. Load
+    con.create_data_frame(pts_gdf).to_view("points", overwrite=True)
+    con.create_data_frame(polys_gdf).to_view("polygons", overwrite=True)
+
+    # 3. Intersection
+    query = """
+        SELECT p.idx
+        FROM points AS p, polygons AS poly
+        WHERE ST_Intersects(p.geometry, poly.geometry)
+    """
+
+    res = con.sql(query).to_pandas()
+    pd.testing.assert_frame_equal(res, pd.DataFrame({"idx": [304, 342, 490, 705]}))
diff --git a/rust/sedona/src/record_batch_reader_provider.rs b/rust/sedona/src/record_batch_reader_provider.rs
index e197f89d..3df47396 100644
--- a/rust/sedona/src/record_batch_reader_provider.rs
+++ b/rust/sedona/src/record_batch_reader_provider.rs
@@ -14,7 +14,8 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use std::{any::Any, fmt::Debug, sync::Arc};
+
+use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc};
 
 use arrow_array::RecordBatchReader;
 use arrow_schema::SchemaRef;
@@ -49,8 +50,13 @@ pub struct RecordBatchReaderProvider {
 unsafe impl Sync for RecordBatchReaderProvider {}
 
 impl RecordBatchReaderProvider {
+    /// Create a new RecordBatchReaderProvider from an existing RecordBatchReader
+    ///
+    /// Schema metadata is stripped if provided. While schema metadata is supported
+    /// in theory in DataFusion, it causes issues with schema equivalence in some
+    /// corner cases: https://github.com/apache/sedona-db/issues/477.
     pub fn new(reader: Box<dyn RecordBatchReader + Send>) -> Self {
-        let schema = reader.schema();
+        let schema = schema_ref_strip_metadata(reader.schema());
         Self {
             reader: Mutex::new(Some(reader)),
             schema,
@@ -297,6 +303,19 @@ impl ExecutionPlan for RecordBatchReaderExec {
     }
 }
 
+/// Strips metadata from a SchemaRef if needed
+fn schema_ref_strip_metadata(schema_ref: SchemaRef) -> SchemaRef {
+    if schema_ref.metadata().is_empty() {
+        schema_ref
+    } else {
+        schema_ref
+            .as_ref()
+            .clone()
+            .with_metadata(HashMap::new())
+            .into()
+    }
+}
+
 #[cfg(test)]
 mod test {

(sedona-db) branch main updated: fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)

Reply via email to