This is an automated email from the ASF dual-hosted git repository.
paleolimbot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 2535b299 fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)
2535b299 is described below
commit 2535b299c27066e8fca0c0c16423fc291dbf23e8
Author: Dewey Dunnington <[email protected]>
AuthorDate: Fri Jan 16 09:23:47 2026 -0600
fix(rust/sedona): Strip schema metadata when input uses RecordBatchReaderProvider (#517)
Co-authored-by: Copilot <[email protected]>
---
python/sedonadb/tests/test_sjoin.py | 54 ++++++++++++++++++++++++-
rust/sedona/src/record_batch_reader_provider.rs | 23 ++++++++++-
2 files changed, 74 insertions(+), 3 deletions(-)
diff --git a/python/sedonadb/tests/test_sjoin.py b/python/sedonadb/tests/test_sjoin.py
index 9169fbfb..e80c55fe 100644
--- a/python/sedonadb/tests/test_sjoin.py
+++ b/python/sedonadb/tests/test_sjoin.py
@@ -14,9 +14,16 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
-import pytest
+
import json
+import warnings
+
+import geopandas as gpd
+import numpy as np
+import pandas as pd
+import pytest
from sedonadb.testing import PostGIS, SedonaDB
+from shapely.geometry import Point
@pytest.mark.parametrize(
@@ -394,3 +401,48 @@ def test_non_optimizable_subquery():
sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
assert len(sedonadb_results) > 0
eng_postgis.assert_query_result(sql, sedonadb_results)
+
+
+def test_spatial_join_with_pandas_metadata(con):
+    # Previous versions of SedonaDB failed to execute this because of a mismatched
+    # schema. Attempts to simplify this reproducer weren't able to recreate the
+    # initial error (PhysicalOptimizer rule 'join_selection' failed).
+    # https://github.com/apache/sedona-db/issues/477
+
+    # 1. Generate Data
+    n_points = 1000
+    n_polys = 10
+
+    # Points (fixed seed so the expected matches below stay stable)
+    rng = np.random.Generator(np.random.MT19937(49791))
+    lons = rng.uniform(-6, 2, n_points)
+    lats = rng.uniform(50, 59, n_points)
+    pts_df = pd.DataFrame(
+        {"idx": range(n_points), "geometry": [Point(x, y) for x, y in zip(lons, lats)]}
+    )
+    pts_gdf = gpd.GeoDataFrame(pts_df, crs="EPSG:4326")
+
+    # Polygons (Centers buffered)
+    plons = rng.uniform(-6, 2, n_polys)
+    plats = rng.uniform(50, 59, n_polys)
+    poly_centers = gpd.GeoDataFrame(
+        {"geometry": [Point(x, y) for x, y in zip(plons, plats)]}, crs="EPSG:4326"
+    )
+    # Simple buffer in degrees (test data so we don't need the GeoPandas warning here)
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore")
+        polys_gdf = poly_centers.buffer(0.1).to_frame(name="geometry")
+
+    # 2. Load
+    con.create_data_frame(pts_gdf).to_view("points", overwrite=True)
+    con.create_data_frame(polys_gdf).to_view("polygons", overwrite=True)
+
+    # 3. Intersection (sorted below because join output order is not guaranteed)
+    query = """
+        SELECT p.idx
+        FROM points AS p, polygons AS poly
+        WHERE ST_Intersects(p.geometry, poly.geometry)
+    """
+
+    res = con.sql(query).to_pandas().sort_values("idx", ignore_index=True)
+    pd.testing.assert_frame_equal(res, pd.DataFrame({"idx": [304, 342, 490, 705]}))
diff --git a/rust/sedona/src/record_batch_reader_provider.rs b/rust/sedona/src/record_batch_reader_provider.rs
index e197f89d..3df47396 100644
--- a/rust/sedona/src/record_batch_reader_provider.rs
+++ b/rust/sedona/src/record_batch_reader_provider.rs
@@ -14,7 +14,8 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use std::{any::Any, fmt::Debug, sync::Arc};
+
+use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc};
use arrow_array::RecordBatchReader;
use arrow_schema::SchemaRef;
@@ -49,8 +50,13 @@ pub struct RecordBatchReaderProvider {
unsafe impl Sync for RecordBatchReaderProvider {}
impl RecordBatchReaderProvider {
+ /// Create a new RecordBatchReaderProvider from an existing RecordBatchReader
+ ///
+ /// Schema metadata is stripped if provided. While schema metadata is supported
+ /// in theory in DataFusion, it causes issues with schema equivalence in some
+ /// corner cases: https://github.com/apache/sedona-db/issues/477.
pub fn new(reader: Box<dyn RecordBatchReader + Send>) -> Self {
- let schema = reader.schema();
+ let schema = schema_ref_strip_metadata(reader.schema());
Self {
reader: Mutex::new(Some(reader)),
schema,
@@ -297,6 +303,19 @@ impl ExecutionPlan for RecordBatchReaderExec {
}
}
+/// Returns `schema_ref` with its schema-level metadata removed
+///
+/// A schema whose metadata is already empty is handed back as-is, so the
+/// common case avoids cloning the schema.
+fn schema_ref_strip_metadata(schema_ref: SchemaRef) -> SchemaRef {
+    if schema_ref.metadata().is_empty() {
+        return schema_ref;
+    }
+
+    let stripped = schema_ref.as_ref().clone().with_metadata(HashMap::new());
+    Arc::new(stripped)
+}
+
#[cfg(test)]
mod test {