paleolimbot commented on code in PR #443: URL: https://github.com/apache/sedona-db/pull/443#discussion_r2615635786
########## rust/sedona-spatial-join/src/partitioning/flat.rs: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Flat (linear scan) spatial partitioner. +//! +//! This module provides a minimal partitioner that shares the same +//! intersection semantics as [`crate::partitioning::rtree::RTreePartitioner`] +//! but avoids the RTree indexing overhead. It stores partition boundaries +//! in a flat array and performs a linear scan to classify each query +//! bounding box. +//! +//! The partitioner follows the standard spatial partition semantics: +//! - Returns [`SpatialPartition::Regular`] when exactly one boundary +//! intersects the query bbox. +//! - Returns [`SpatialPartition::Multi`] when multiple boundaries +//! intersect the query bbox. +//! - Returns [`SpatialPartition::None`] when no boundary intersects +//! the query bbox. + +use datafusion_common::Result; +use geo::Rect; +use sedona_geometry::bounding_box::BoundingBox; + +use crate::partitioning::util::{bbox_to_geo_rect, rect_intersection_area, rects_intersect}; +use crate::partitioning::{SpatialPartition, SpatialPartitioner}; + +/// Spatial partitioner that linearly scans partition boundaries. +pub struct FlatPartitioner { + boundaries: Vec<Rect<f32>>, + num_partitions: usize, +} + +impl FlatPartitioner { + /// Create a new flat partitioner from explicit partition boundaries. + pub fn new(boundaries: Vec<BoundingBox>) -> Result<Self> { + let mut rects = Vec::with_capacity(boundaries.len()); + for bbox in boundaries { + rects.push(bbox_to_geo_rect(&bbox)?); + } + + let num_partitions = rects.len(); + + Ok(Self { + boundaries: rects, + num_partitions, + }) + } +} + +impl SpatialPartitioner for FlatPartitioner { + fn num_regular_partitions(&self) -> usize { + self.num_partitions + } + + fn partition(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let query_rect = bbox_to_geo_rect(bbox)?; + let mut first_match = None; + for (idx, boundary) in self.boundaries.iter().enumerate() { + if rects_intersect(boundary, &query_rect) { + if first_match.is_some() { + return Ok(SpatialPartition::Multi); + } + first_match = Some(idx as u32); + } + } + + Ok(match first_match { + Some(id) => SpatialPartition::Regular(id), + None => SpatialPartition::None, + }) + } + + fn partition_no_multi(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let query_rect = bbox_to_geo_rect(bbox)?; + let mut best_partition = None; + let mut best_area = -1.0_f32; + + for (idx, boundary) in self.boundaries.iter().enumerate() { + if rects_intersect(boundary, &query_rect) { + let area = rect_intersection_area(boundary, &query_rect); + if area > best_area { + best_area = area; + best_partition = Some(idx as u32); + } + } + } + + Ok(match best_partition { + Some(id) => SpatialPartition::Regular(id), + None => SpatialPartition::None, + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn sample_partitions() -> Vec<BoundingBox> { + vec![ + BoundingBox::xy((0.0, 50.0), (0.0, 50.0)), + BoundingBox::xy((50.0, 100.0), (0.0, 50.0)), + BoundingBox::xy((0.0, 50.0), (50.0, 100.0)), + BoundingBox::xy((50.0, 100.0), (50.0, 100.0)), + ] + } + + #[test] + fn test_flat_partitioner_creation() { + let partitioner = FlatPartitioner::new(sample_partitions()).unwrap(); + assert_eq!(partitioner.num_regular_partitions(), 4); + } + + #[test] + fn test_flat_partitioner_regular() { + let partitioner = FlatPartitioner::new(sample_partitions()).unwrap(); + let bbox = BoundingBox::xy((10.0, 20.0), (10.0, 20.0)); + match partitioner.partition(&bbox).unwrap() { + SpatialPartition::Regular(id) => assert_eq!(id, 0), + _ => panic!("expected Regular partition"), + } Review Comment: Is there a reason this won't work? (Here and below) ```suggestion assert_eq!(partitioner.partition(&bbox).unwrap(), SpatialPartition::Regular(0)); ``` ########## rust/sedona-spatial-join/src/partitioning/flat.rs: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Flat (linear scan) spatial partitioner. +//! +//! This module provides a minimal partitioner that shares the same +//! intersection semantics as [`crate::partitioning::rtree::RTreePartitioner`] +//! but avoids the RTree indexing overhead. It stores partition boundaries +//! in a flat array and performs a linear scan to classify each query +//! bounding box. +//! +//! The partitioner follows the standard spatial partition semantics: +//! - Returns [`SpatialPartition::Regular`] when exactly one boundary +//! intersects the query bbox. +//! - Returns [`SpatialPartition::Multi`] when multiple boundaries +//! intersect the query bbox. +//! - Returns [`SpatialPartition::None`] when no boundary intersects +//! the query bbox. + +use datafusion_common::Result; +use geo::Rect; +use sedona_geometry::bounding_box::BoundingBox; + +use crate::partitioning::util::{bbox_to_geo_rect, rect_intersection_area, rects_intersect}; +use crate::partitioning::{SpatialPartition, SpatialPartitioner}; + +/// Spatial partitioner that linearly scans partition boundaries. +pub struct FlatPartitioner { + boundaries: Vec<Rect<f32>>, + num_partitions: usize, +} + +impl FlatPartitioner { + /// Create a new flat partitioner from explicit partition boundaries. + pub fn new(boundaries: Vec<BoundingBox>) -> Result<Self> { Review Comment: Are we passing around a sufficient number of these that it would be faster/use less memory to have a `BoundingBox2d`? ########## rust/sedona-spatial-join/src/partitioning/kdb.rs: ########## @@ -0,0 +1,843 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KDB-tree spatial partitioning implementation. +//! +//! This module provides a K-D-B tree (K-Dimensional B-tree) implementation for spatial +//! partitioning, which is essential for out-of-core spatial joins. +//! +//! # Partitioning Strategy +//! +//! - **Partition Assignment**: +//! - `partition()`: Returns `SpatialPartition::Regular(id)` if the bbox intersects exactly one partition, +//! `SpatialPartition::Multi` if it intersects multiple, or `SpatialPartition::None` if it intersects none. +//! - `partition_no_multi()`: Assigns the bbox to the partition with the largest intersection area +//! (Maximum Overlap Strategy), or `SpatialPartition::None` if no intersection. +//! +//! # Algorithm +//! +//! The KDB tree partitions space by recursively splitting it along alternating axes: +//! +//! 1. Start with the full spatial extent +//! 2. When a node exceeds `max_items_per_node`, split it: +//! - Choose the longer dimension (x or y) +//! - Sort items by their minimum coordinate on that dimension +//! - Split at the median item's coordinate +//! 3. Continue until reaching `max_levels` depth or items fit in nodes +//! 4. Assign sequential IDs to leaf nodes + +use std::sync::Arc; + +use crate::partitioning::{ + util::{bbox_to_geo_rect, rect_contains_point, rect_intersection_area, rects_intersect}, + SpatialPartition, SpatialPartitioner, +}; +use datafusion_common::Result; +use geo::{Coord, Rect}; +use sedona_geometry::bounding_box::BoundingBox; + +/// K-D-B tree spatial partitioner implementation. +/// +/// See https://en.wikipedia.org/wiki/K-D-B-tree +/// +/// The KDB tree is a hierarchical spatial data structure that recursively +/// partitions space using axis-aligned splits. It adapts to the spatial +/// distribution of data, making it effective for spatial partitioning. +/// +/// # Example +/// +/// ``` +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::{SpatialPartitioner, SpatialPartition}; +/// +/// // Create sample bounding boxes +/// let bboxes = (0..20).map(|i| { +/// let x = (i % 10) as f64 * 10.0; +/// let y = (i / 10) as f64 * 10.0; +/// BoundingBox::xy((x, x + 5.0), (y, y + 5.0)) +/// }); +/// +/// // Build the KDB partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build(bboxes, 5, 3, extent) +/// .expect("failed to build KDB partitioner"); +/// +/// // Query which partition a bounding box belongs to +/// let test_bbox = BoundingBox::xy((1.0, 4.0), (1.0, 4.0)); +/// match partitioner.partition(&test_bbox).unwrap() { +/// SpatialPartition::Regular(id) => println!("Partition ID: {}", id), +/// _ => unreachable!(), +/// } +/// ``` +pub(crate) struct KDBTree { + max_items_per_node: usize, + max_levels: usize, + extent: Rect<f32>, + level: usize, + items: Vec<Rect<f32>>, + children: Option<Box<[KDBTree; 2]>>, + leaf_id: u32, +} + +#[cfg(test)] +pub(crate) struct KDBResult { + extent: Rect<f32>, + leaf_id: u32, +} + +#[cfg(test)] +impl KDBResult { + fn new(kdb: &KDBTree) -> Self { + Self { + extent: kdb.extent, + leaf_id: kdb.leaf_id, + } + } +} + +impl KDBTree { + /// Create a new KDB tree with the given parameters. + /// + /// # Arguments + /// * `max_items_per_node` - Maximum number of items before splitting a node + /// * `max_levels` - Maximum depth of the tree + /// * `extent` - The spatial extent covered by this tree + pub fn new(max_items_per_node: usize, max_levels: usize, extent: BoundingBox) -> Result<Self> { + let extent_rect = bbox_to_geo_rect(&extent)?; + Ok(Self::new_with_level( + max_items_per_node, + max_levels, + 0, + extent_rect, + )) + } + + fn new_with_level( + max_items_per_node: usize, + max_levels: usize, + level: usize, + extent: Rect<f32>, + ) -> Self { + KDBTree { + max_items_per_node, + max_levels, + extent, + level, + items: Vec::new(), + children: None, + leaf_id: 0, + } + } + + /// Insert a bounding box into the tree. + pub fn insert(&mut self, bbox: BoundingBox) -> Result<()> { + let rect = bbox_to_geo_rect(&bbox)?; + self.insert_rect(rect); + Ok(()) + } + + fn insert_rect(&mut self, rect: Rect<f32>) { + if self.items.len() < self.max_items_per_node || self.level >= self.max_levels { + self.items.push(rect); + } else { + if self.children.is_none() { + // Split over longer side + let split_x = self.extent.width() > self.extent.height(); + let mut ok = self.split(split_x); + if !ok { + // Try splitting by the other side + ok = self.split(!split_x); + } + + if !ok { + // This could happen if all envelopes are the same. + self.items.push(rect); + return; + } + } + + // Insert into appropriate child + if let Some(ref mut children) = self.children { + let min_point = rect.min(); + for child in children.iter_mut() { + if rect_contains_point(child.extent(), &min_point) { + child.insert_rect(rect); + break; + } + } + } + } + } + + /// Check if this node is a leaf node. + pub fn is_leaf(&self) -> bool { + self.children.is_none() + } + + /// Get the leaf ID (only valid for leaf nodes). + pub fn leaf_id(&self) -> u32 { + assert!(self.is_leaf(), "leaf_id() called on non-leaf node"); + self.leaf_id + } + + /// Get the spatial extent of this node. + pub fn extent(&self) -> &Rect<f32> { + &self.extent + } + + /// Assign leaf IDs to all leaf nodes in the tree (breadth-first traversal). + pub fn assign_leaf_ids(&mut self) { + let mut next_id = 0; + self.assign_leaf_ids_recursive(&mut next_id); + } + + fn assign_leaf_ids_recursive(&mut self, next_id: &mut u32) { + if self.is_leaf() { + self.leaf_id = *next_id; + *next_id += 1; + } else if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.assign_leaf_ids_recursive(next_id); + } + } + } + + /// Find all leaf nodes that intersect with the given bounding box. + #[cfg(test)] + pub fn find_leaf_nodes(&self, rect: &Rect<f32>, matches: &mut Vec<KDBResult>) { + self.visit_intersecting_leaf_nodes(rect, &mut |kdb| { + matches.push(KDBResult::new(kdb)); + }); + } + + pub fn visit_intersecting_leaf_nodes<'a>( + &'a self, + rect: &Rect<f32>, + f: &mut impl FnMut(&'a KDBTree), + ) { + if !rects_intersect(&self.extent, rect) { + return; + } + + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_intersecting_leaf_nodes(rect, f); + } + } + } + + pub fn visit_leaf_nodes<'a>(&'a self, f: &mut impl FnMut(&'a KDBTree)) { + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_leaf_nodes(f); + } + } + } + + #[cfg(test)] + pub fn collect_leaf_nodes(&self) -> Vec<&KDBTree> { + let mut leaf_nodes = Vec::new(); + self.visit_leaf_nodes(&mut |kdb| { + leaf_nodes.push(kdb); + }); + leaf_nodes + } + + pub fn num_leaf_nodes(&self) -> usize { + let mut num = 0; + self.visit_leaf_nodes(&mut |_| { + num += 1; + }); + num + } + + /// Drop all stored items to save memory after tree construction. + pub fn drop_elements(&mut self) { + self.items.clear(); + if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.drop_elements(); + } + } + } + + /// Split this node along the specified axis. + /// Returns true if the split was successful, false otherwise. Review Comment: It might be worth briefly explaining here why this split would fail (it seems like it is mostly if you have 100% identical rects) ########## rust/sedona-spatial-join/src/partitioning/kdb.rs: ########## @@ -0,0 +1,843 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KDB-tree spatial partitioning implementation. +//! +//! This module provides a K-D-B tree (K-Dimensional B-tree) implementation for spatial +//! partitioning, which is essential for out-of-core spatial joins. +//! +//! # Partitioning Strategy +//! +//! - **Partition Assignment**: +//! - `partition()`: Returns `SpatialPartition::Regular(id)` if the bbox intersects exactly one partition, +//! `SpatialPartition::Multi` if it intersects multiple, or `SpatialPartition::None` if it intersects none. +//! - `partition_no_multi()`: Assigns the bbox to the partition with the largest intersection area +//! (Maximum Overlap Strategy), or `SpatialPartition::None` if no intersection. +//! +//! # Algorithm +//! +//! The KDB tree partitions space by recursively splitting it along alternating axes: +//! +//! 1. Start with the full spatial extent +//! 2. When a node exceeds `max_items_per_node`, split it: +//! - Choose the longer dimension (x or y) +//! - Sort items by their minimum coordinate on that dimension +//! - Split at the median item's coordinate +//! 3. Continue until reaching `max_levels` depth or items fit in nodes +//! 4. Assign sequential IDs to leaf nodes + +use std::sync::Arc; + +use crate::partitioning::{ + util::{bbox_to_geo_rect, rect_contains_point, rect_intersection_area, rects_intersect}, + SpatialPartition, SpatialPartitioner, +}; +use datafusion_common::Result; +use geo::{Coord, Rect}; +use sedona_geometry::bounding_box::BoundingBox; + +/// K-D-B tree spatial partitioner implementation. +/// +/// See https://en.wikipedia.org/wiki/K-D-B-tree +/// +/// The KDB tree is a hierarchical spatial data structure that recursively +/// partitions space using axis-aligned splits. It adapts to the spatial +/// distribution of data, making it effective for spatial partitioning. +/// +/// # Example +/// +/// ``` +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::{SpatialPartitioner, SpatialPartition}; +/// +/// // Create sample bounding boxes +/// let bboxes = (0..20).map(|i| { +/// let x = (i % 10) as f64 * 10.0; +/// let y = (i / 10) as f64 * 10.0; +/// BoundingBox::xy((x, x + 5.0), (y, y + 5.0)) +/// }); +/// +/// // Build the KDB partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build(bboxes, 5, 3, extent) +/// .expect("failed to build KDB partitioner"); +/// +/// // Query which partition a bounding box belongs to +/// let test_bbox = BoundingBox::xy((1.0, 4.0), (1.0, 4.0)); +/// match partitioner.partition(&test_bbox).unwrap() { +/// SpatialPartition::Regular(id) => println!("Partition ID: {}", id), +/// _ => unreachable!(), +/// } +/// ``` +pub(crate) struct KDBTree { + max_items_per_node: usize, + max_levels: usize, + extent: Rect<f32>, + level: usize, + items: Vec<Rect<f32>>, + children: Option<Box<[KDBTree; 2]>>, + leaf_id: u32, +} + +#[cfg(test)] +pub(crate) struct KDBResult { + extent: Rect<f32>, + leaf_id: u32, +} + +#[cfg(test)] +impl KDBResult { + fn new(kdb: &KDBTree) -> Self { + Self { + extent: kdb.extent, + leaf_id: kdb.leaf_id, + } + } +} + +impl KDBTree { + /// Create a new KDB tree with the given parameters. + /// + /// # Arguments + /// * `max_items_per_node` - Maximum number of items before splitting a node + /// * `max_levels` - Maximum depth of the tree + /// * `extent` - The spatial extent covered by this tree + pub fn new(max_items_per_node: usize, max_levels: usize, extent: BoundingBox) -> Result<Self> { + let extent_rect = bbox_to_geo_rect(&extent)?; + Ok(Self::new_with_level( + max_items_per_node, + max_levels, + 0, + extent_rect, + )) + } + + fn new_with_level( + max_items_per_node: usize, + max_levels: usize, + level: usize, + extent: Rect<f32>, + ) -> Self { + KDBTree { + max_items_per_node, + max_levels, + extent, + level, + items: Vec::new(), + children: None, + leaf_id: 0, + } + } + + /// Insert a bounding box into the tree. + pub fn insert(&mut self, bbox: BoundingBox) -> Result<()> { + let rect = bbox_to_geo_rect(&bbox)?; + self.insert_rect(rect); + Ok(()) + } + + fn insert_rect(&mut self, rect: Rect<f32>) { + if self.items.len() < self.max_items_per_node || self.level >= self.max_levels { + self.items.push(rect); + } else { + if self.children.is_none() { + // Split over longer side + let split_x = self.extent.width() > self.extent.height(); + let mut ok = self.split(split_x); + if !ok { + // Try splitting by the other side + ok = self.split(!split_x); + } + + if !ok { + // This could happen if all envelopes are the same. + self.items.push(rect); + return; + } + } + + // Insert into appropriate child + if let Some(ref mut children) = self.children { + let min_point = rect.min(); + for child in children.iter_mut() { + if rect_contains_point(child.extent(), &min_point) { + child.insert_rect(rect); + break; + } + } + } + } + } + + /// Check if this node is a leaf node. + pub fn is_leaf(&self) -> bool { + self.children.is_none() + } + + /// Get the leaf ID (only valid for leaf nodes). + pub fn leaf_id(&self) -> u32 { + assert!(self.is_leaf(), "leaf_id() called on non-leaf node"); + self.leaf_id + } + + /// Get the spatial extent of this node. + pub fn extent(&self) -> &Rect<f32> { + &self.extent + } + + /// Assign leaf IDs to all leaf nodes in the tree (breadth-first traversal). + pub fn assign_leaf_ids(&mut self) { + let mut next_id = 0; + self.assign_leaf_ids_recursive(&mut next_id); + } + + fn assign_leaf_ids_recursive(&mut self, next_id: &mut u32) { + if self.is_leaf() { + self.leaf_id = *next_id; + *next_id += 1; + } else if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.assign_leaf_ids_recursive(next_id); + } + } + } + + /// Find all leaf nodes that intersect with the given bounding box. + #[cfg(test)] + pub fn find_leaf_nodes(&self, rect: &Rect<f32>, matches: &mut Vec<KDBResult>) { + self.visit_intersecting_leaf_nodes(rect, &mut |kdb| { + matches.push(KDBResult::new(kdb)); + }); + } + + pub fn visit_intersecting_leaf_nodes<'a>( + &'a self, + rect: &Rect<f32>, + f: &mut impl FnMut(&'a KDBTree), + ) { + if !rects_intersect(&self.extent, rect) { + return; + } + + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_intersecting_leaf_nodes(rect, f); + } + } + } + + pub fn visit_leaf_nodes<'a>(&'a self, f: &mut impl FnMut(&'a KDBTree)) { + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_leaf_nodes(f); + } + } + } + + #[cfg(test)] + pub fn collect_leaf_nodes(&self) -> Vec<&KDBTree> { + let mut leaf_nodes = Vec::new(); + self.visit_leaf_nodes(&mut |kdb| { + leaf_nodes.push(kdb); + }); + leaf_nodes + } + + pub fn num_leaf_nodes(&self) -> usize { + let mut num = 0; + self.visit_leaf_nodes(&mut |_| { + num += 1; + }); + num + } + + /// Drop all stored items to save memory after tree construction. + pub fn drop_elements(&mut self) { + self.items.clear(); + if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.drop_elements(); + } + } + } + + /// Split this node along the specified axis. + /// Returns true if the split was successful, false otherwise. + fn split(&mut self, split_x: bool) -> bool { + // Sort items by the appropriate coordinate + if split_x { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } else { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } + + // Find the split coordinate from the middle item + let middle_idx = self.items.len() / 2; + let middle_min = self.items[middle_idx].min(); + + let split_coord = if split_x { middle_min.x } else { middle_min.y }; + + let extent_min = self.extent.min(); + let extent_max = self.extent.max(); + + // Check if split coordinate is valid + if split_x { + if split_coord <= extent_min.x || split_coord >= extent_max.x { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + } else if split_coord <= extent_min.y || split_coord >= extent_max.y { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + + // Create child extents + let (left_extent, right_extent) = if split_x { + let left = Rect::new( + extent_min, + Coord { + x: split_coord, + y: extent_max.y, + }, + ); + let right = Rect::new( + Coord { + x: split_coord, + y: extent_min.y, + }, + extent_max, + ); + (left, right) + } else { + let left = Rect::new( + extent_min, + Coord { + x: extent_max.x, + y: split_coord, + }, + ); + let right = Rect::new( + Coord { + x: extent_min.x, + y: split_coord, + }, + extent_max, + ); + (left, right) + }; + + // Create children + let mut left_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + left_extent, + ); + let mut right_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + right_extent, + ); + + // Distribute items to children + for item in self.items.drain(..) { + let belongs_to_left = if split_x { + item.min().x <= split_coord + } else { + item.min().y <= split_coord + }; + + if belongs_to_left { + left_child.insert_rect(item); + } else { + right_child.insert_rect(item); + } + } + + self.children = Some(Box::new([left_child, right_child])); + true + } +} + +/// KDB tree partitioner that implements the SpatialPartitioner trait. +/// +/// # Example Usage +/// +/// ```rust +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::SpatialPartitioner; +/// +/// // Sample bounding boxes from your data +/// let bboxes = vec![ +/// BoundingBox::xy((0.0, 10.0), (0.0, 10.0)), +/// BoundingBox::xy((10.0, 20.0), (0.0, 10.0)), +/// BoundingBox::xy((20.0, 30.0), (0.0, 10.0)), +/// ]; +/// +/// // Build partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build( +/// bboxes.into_iter(), +/// 10, // max_items_per_node +/// 4, // max_levels +/// extent +/// ).expect("failed to build KDB partitioner"); +/// +/// // Query partition for a new bbox +/// let query_bbox = BoundingBox::xy((5.0, 15.0), (5.0, 15.0)); +/// let partition = partitioner.partition(&query_bbox).unwrap(); +/// ``` +pub struct KDBPartitioner { + tree: Arc<KDBTree>, +} + +impl KDBPartitioner { + /// Create a new KDB partitioner from a KDB tree. + /// + /// The tree should already be built and have leaf IDs assigned. + pub(crate) fn new(tree: Arc<KDBTree>) -> Self { + KDBPartitioner { tree } + } + + /// Build a KDB tree from a collection of bounding boxes. + /// + /// # Arguments + /// * `bboxes` - Iterator of bounding boxes to partition + /// * `max_items_per_node` - Maximum items per node before splitting + /// * `max_levels` - Maximum tree depth + /// * `extent` - The spatial extent to partition + pub fn build( + bboxes: impl Iterator<Item = BoundingBox>, + max_items_per_node: usize, + max_levels: usize, + extent: BoundingBox, + ) -> Result<Self> { + let mut tree = KDBTree::new(max_items_per_node, max_levels, extent)?; + + for bbox in bboxes { + tree.insert(bbox)?; + } + + tree.assign_leaf_ids(); + tree.drop_elements(); + + Ok(Self::new(Arc::new(tree))) + } + + /// Get the number of leaf partitions in the tree. + pub fn num_partitions(&self) -> usize { + self.tree.num_leaf_nodes() + } +} + +impl SpatialPartitioner for KDBPartitioner { + fn num_regular_partitions(&self) -> usize { + self.num_partitions() + } + + fn partition(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let rect = bbox_to_geo_rect(bbox)?; + let mut sp: Option<SpatialPartition> = None; + + self.tree.visit_intersecting_leaf_nodes(&rect, &mut |kdb| { + if sp.is_none() { + sp = Some(SpatialPartition::Regular(kdb.leaf_id())); + } else { + sp = Some(SpatialPartition::Multi) + } + }); + + Ok(sp.unwrap_or(SpatialPartition::None)) + } + + fn partition_no_multi(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let rect = bbox_to_geo_rect(bbox)?; + + let mut best_leaf_id: Option<u32> = None; + let mut max_area = 0.0_f32; + + self.tree.visit_intersecting_leaf_nodes(&rect, &mut |kdb| { + let area = rect_intersection_area(&rect, kdb.extent()); + if best_leaf_id.is_none() || area > max_area { + best_leaf_id = Some(kdb.leaf_id()); + max_area = area; + } + }); + + match best_leaf_id { + Some(leaf_id) => Ok(SpatialPartition::Regular(leaf_id)), + None => Ok(SpatialPartition::None), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_kdb_insert_and_leaf_assignment() -> Result<()> { Review Comment: Is there a reason these tests return Result instead of using `unwrap()`? (I tend to like the failures by `unwrap()` but DataFusion seems to like returning results) ########## rust/sedona-spatial-join/src/partitioning/flat.rs: ########## @@ -0,0 +1,177 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Flat (linear scan) spatial partitioner. +//! +//! This module provides a minimal partitioner that shares the same +//! intersection semantics as [`crate::partitioning::rtree::RTreePartitioner`] +//! but avoids the RTree indexing overhead. It stores partition boundaries Review Comment: If you happen to know how many partition boundaries it takes to make the RTree worth it, it might be nice to add it to this comment ########## rust/sedona-spatial-join/src/partitioning/kdb.rs: ########## @@ -0,0 +1,843 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KDB-tree spatial partitioning implementation. +//! +//! This module provides a K-D-B tree (K-Dimensional B-tree) implementation for spatial +//! partitioning, which is essential for out-of-core spatial joins. +//! +//! # Partitioning Strategy +//! +//! - **Partition Assignment**: +//! - `partition()`: Returns `SpatialPartition::Regular(id)` if the bbox intersects exactly one partition, +//! `SpatialPartition::Multi` if it intersects multiple, or `SpatialPartition::None` if it intersects none. +//! - `partition_no_multi()`: Assigns the bbox to the partition with the largest intersection area +//! (Maximum Overlap Strategy), or `SpatialPartition::None` if no intersection. +//! +//! # Algorithm +//! +//! The KDB tree partitions space by recursively splitting it along alternating axes: +//! +//! 1. Start with the full spatial extent +//! 2. When a node exceeds `max_items_per_node`, split it: +//! - Choose the longer dimension (x or y) +//! - Sort items by their minimum coordinate on that dimension +//! - Split at the median item's coordinate +//! 3. Continue until reaching `max_levels` depth or items fit in nodes +//! 4. Assign sequential IDs to leaf nodes + +use std::sync::Arc; + +use crate::partitioning::{ + util::{bbox_to_geo_rect, rect_contains_point, rect_intersection_area, rects_intersect}, + SpatialPartition, SpatialPartitioner, +}; +use datafusion_common::Result; +use geo::{Coord, Rect}; +use sedona_geometry::bounding_box::BoundingBox; + +/// K-D-B tree spatial partitioner implementation. +/// +/// See https://en.wikipedia.org/wiki/K-D-B-tree +/// +/// The KDB tree is a hierarchical spatial data structure that recursively +/// partitions space using axis-aligned splits. It adapts to the spatial +/// distribution of data, making it effective for spatial partitioning. +/// +/// # Example +/// +/// ``` +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::{SpatialPartitioner, SpatialPartition}; +/// +/// // Create sample bounding boxes +/// let bboxes = (0..20).map(|i| { +/// let x = (i % 10) as f64 * 10.0; +/// let y = (i / 10) as f64 * 10.0; +/// BoundingBox::xy((x, x + 5.0), (y, y + 5.0)) +/// }); +/// +/// // Build the KDB partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build(bboxes, 5, 3, extent) +/// .expect("failed to build KDB partitioner"); +/// +/// // Query which partition a bounding box belongs to +/// let test_bbox = BoundingBox::xy((1.0, 4.0), (1.0, 4.0)); +/// match partitioner.partition(&test_bbox).unwrap() { +/// SpatialPartition::Regular(id) => println!("Partition ID: {}", id), +/// _ => unreachable!(), +/// } +/// ``` +pub(crate) struct KDBTree { + max_items_per_node: usize, + max_levels: usize, + extent: Rect<f32>, + level: usize, + items: Vec<Rect<f32>>, + children: Option<Box<[KDBTree; 2]>>, + leaf_id: u32, +} + +#[cfg(test)] +pub(crate) struct KDBResult { + extent: Rect<f32>, + leaf_id: u32, +} + +#[cfg(test)] +impl KDBResult { + fn new(kdb: &KDBTree) -> Self { + Self { + extent: kdb.extent, + leaf_id: kdb.leaf_id, + } + } +} + +impl KDBTree { + /// Create a new KDB tree with the given parameters. + /// + /// # Arguments + /// * `max_items_per_node` - Maximum number of items before splitting a node + /// * `max_levels` - Maximum depth of the tree + /// * `extent` - The spatial extent covered by this tree + pub fn new(max_items_per_node: usize, max_levels: usize, extent: BoundingBox) -> Result<Self> { + let extent_rect = bbox_to_geo_rect(&extent)?; + Ok(Self::new_with_level( + max_items_per_node, + max_levels, + 0, + extent_rect, + )) + } + + fn new_with_level( + max_items_per_node: usize, + max_levels: usize, + level: usize, + extent: Rect<f32>, + ) -> Self { + KDBTree { + max_items_per_node, + max_levels, + extent, + level, + items: Vec::new(), + children: None, + leaf_id: 0, + } + } + + /// Insert a bounding box into the tree. + pub fn insert(&mut self, bbox: BoundingBox) -> Result<()> { + let rect = bbox_to_geo_rect(&bbox)?; + self.insert_rect(rect); + Ok(()) + } + + fn insert_rect(&mut self, rect: Rect<f32>) { + if self.items.len() < self.max_items_per_node || self.level >= self.max_levels { + self.items.push(rect); + } else { + if self.children.is_none() { + // Split over longer side + let split_x = self.extent.width() > self.extent.height(); + let mut ok = self.split(split_x); + if !ok { + // Try splitting by the other side + ok = self.split(!split_x); + } + + if !ok { + // This could happen if all envelopes are the same. + self.items.push(rect); + return; + } + } + + // Insert into appropriate child + if let Some(ref mut children) = self.children { + let min_point = rect.min(); + for child in children.iter_mut() { + if rect_contains_point(child.extent(), &min_point) { + child.insert_rect(rect); + break; + } + } + } + } + } + + /// Check if this node is a leaf node. + pub fn is_leaf(&self) -> bool { + self.children.is_none() + } + + /// Get the leaf ID (only valid for leaf nodes). + pub fn leaf_id(&self) -> u32 { + assert!(self.is_leaf(), "leaf_id() called on non-leaf node"); + self.leaf_id + } + + /// Get the spatial extent of this node. + pub fn extent(&self) -> &Rect<f32> { + &self.extent + } + + /// Assign leaf IDs to all leaf nodes in the tree (breadth-first traversal). + pub fn assign_leaf_ids(&mut self) { + let mut next_id = 0; + self.assign_leaf_ids_recursive(&mut next_id); + } + + fn assign_leaf_ids_recursive(&mut self, next_id: &mut u32) { + if self.is_leaf() { + self.leaf_id = *next_id; + *next_id += 1; + } else if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.assign_leaf_ids_recursive(next_id); + } + } + } + + /// Find all leaf nodes that intersect with the given bounding box. + #[cfg(test)] + pub fn find_leaf_nodes(&self, rect: &Rect<f32>, matches: &mut Vec<KDBResult>) { + self.visit_intersecting_leaf_nodes(rect, &mut |kdb| { + matches.push(KDBResult::new(kdb)); + }); + } + + pub fn visit_intersecting_leaf_nodes<'a>( + &'a self, + rect: &Rect<f32>, + f: &mut impl FnMut(&'a KDBTree), + ) { + if !rects_intersect(&self.extent, rect) { + return; + } + + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_intersecting_leaf_nodes(rect, f); + } + } + } + + pub fn visit_leaf_nodes<'a>(&'a self, f: &mut impl FnMut(&'a KDBTree)) { + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_leaf_nodes(f); + } + } + } + + #[cfg(test)] + pub fn collect_leaf_nodes(&self) -> Vec<&KDBTree> { + let mut leaf_nodes = Vec::new(); + self.visit_leaf_nodes(&mut |kdb| { + leaf_nodes.push(kdb); + }); + leaf_nodes + } + + pub fn num_leaf_nodes(&self) -> usize { + let mut num = 0; + self.visit_leaf_nodes(&mut |_| { + num += 1; + }); + num + } + + /// Drop all stored items to save memory after tree construction. + pub fn drop_elements(&mut self) { + self.items.clear(); + if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.drop_elements(); + } + } + } + + /// Split this node along the specified axis. + /// Returns true if the split was successful, false otherwise. + fn split(&mut self, split_x: bool) -> bool { + // Sort items by the appropriate coordinate + if split_x { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } else { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } + + // Find the split coordinate from the middle item + let middle_idx = self.items.len() / 2; + let middle_min = self.items[middle_idx].min(); + + let split_coord = if split_x { middle_min.x } else { middle_min.y }; + + let extent_min = self.extent.min(); + let extent_max = self.extent.max(); + + // Check if split coordinate is valid + if split_x { + if split_coord <= extent_min.x || split_coord >= extent_max.x { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + } else if split_coord <= extent_min.y || split_coord >= extent_max.y { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + + // Create child extents + let (left_extent, right_extent) = if split_x { + let left = Rect::new( + extent_min, + Coord { + x: split_coord, + y: extent_max.y, + }, + ); + let right = Rect::new( + Coord { + x: split_coord, + y: extent_min.y, + }, + extent_max, + ); + (left, right) + } else { + let left = Rect::new( + extent_min, + Coord { + x: extent_max.x, + y: split_coord, + }, + ); + let right = Rect::new( + Coord { + x: extent_min.x, + y: split_coord, + }, + extent_max, + ); + (left, right) + }; + + // Create children + let mut left_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + left_extent, + ); + let mut right_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + right_extent, + ); + + // Distribute items to children + for item in self.items.drain(..) { + let belongs_to_left = if split_x { + item.min().x <= split_coord + } else { + item.min().y <= split_coord + }; + + if belongs_to_left { + left_child.insert_rect(item); + } else { + right_child.insert_rect(item); + } + } + + self.children = Some(Box::new([left_child, right_child])); + true + } +} + +/// KDB tree partitioner that implements the SpatialPartitioner trait. +/// +/// # Example Usage +/// +/// ```rust +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::SpatialPartitioner; +/// +/// // Sample bounding boxes from your data +/// let bboxes = vec![ +/// BoundingBox::xy((0.0, 10.0), (0.0, 10.0)), +/// BoundingBox::xy((10.0, 20.0), (0.0, 10.0)), +/// BoundingBox::xy((20.0, 30.0), (0.0, 10.0)), +/// ]; +/// +/// // Build partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build( +/// bboxes.into_iter(), +/// 10, // max_items_per_node +/// 4, // max_levels +/// extent +/// ).expect("failed to build KDB partitioner"); +/// +/// // Query partition for a new bbox +/// let query_bbox = BoundingBox::xy((5.0, 15.0), (5.0, 15.0)); +/// let partition = partitioner.partition(&query_bbox).unwrap(); +/// ``` +pub struct KDBPartitioner { + tree: Arc<KDBTree>, +} + +impl KDBPartitioner { + /// Create a new KDB partitioner from a KDB tree. + /// + /// The tree should already be built and have leaf IDs assigned. + pub(crate) fn new(tree: Arc<KDBTree>) -> Self { + KDBPartitioner { tree } + } + + /// Build a KDB tree from a collection of bounding boxes. + /// + /// # Arguments + /// * `bboxes` - Iterator of bounding boxes to partition + /// * `max_items_per_node` - Maximum items per node before splitting + /// * `max_levels` - Maximum tree depth + /// * `extent` - The spatial extent to partition + pub fn build( + bboxes: impl Iterator<Item = BoundingBox>, + max_items_per_node: usize, + max_levels: usize, + extent: BoundingBox, + ) -> Result<Self> { + let mut tree = KDBTree::new(max_items_per_node, max_levels, extent)?; + + for bbox in bboxes { + tree.insert(bbox)?; + } + + tree.assign_leaf_ids(); + tree.drop_elements(); + + Ok(Self::new(Arc::new(tree))) + } + + /// Get the number of leaf partitions in the tree. + pub fn num_partitions(&self) -> usize { + self.tree.num_leaf_nodes() + } +} + +impl SpatialPartitioner for KDBPartitioner { + fn num_regular_partitions(&self) -> usize { + self.num_partitions() + } + + fn partition(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let rect = bbox_to_geo_rect(bbox)?; + let mut sp: Option<SpatialPartition> = None; + + self.tree.visit_intersecting_leaf_nodes(&rect, &mut |kdb| { + if sp.is_none() { + sp = Some(SpatialPartition::Regular(kdb.leaf_id())); + } else { + sp = Some(SpatialPartition::Multi) + } + }); + + Ok(sp.unwrap_or(SpatialPartition::None)) + } + + fn partition_no_multi(&self, bbox: &BoundingBox) -> Result<SpatialPartition> { + let rect = bbox_to_geo_rect(bbox)?; + + let mut best_leaf_id: Option<u32> = None; + let mut max_area = 0.0_f32; Review Comment: Above you use `let mut max_area = -1.0_f32` and just compare `area > max_area`. Is there a reason it's different here? ########## rust/sedona-spatial-join/src/partitioning/kdb.rs: ########## @@ -0,0 +1,843 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KDB-tree spatial partitioning implementation. +//! +//! This module provides a K-D-B tree (K-Dimensional B-tree) implementation for spatial +//! partitioning, which is essential for out-of-core spatial joins. +//! +//! # Partitioning Strategy +//! +//! - **Partition Assignment**: +//! - `partition()`: Returns `SpatialPartition::Regular(id)` if the bbox intersects exactly one partition, +//! `SpatialPartition::Multi` if it intersects multiple, or `SpatialPartition::None` if it intersects none. +//! - `partition_no_multi()`: Assigns the bbox to the partition with the largest intersection area +//! (Maximum Overlap Strategy), or `SpatialPartition::None` if no intersection. +//! +//! # Algorithm +//! +//! The KDB tree partitions space by recursively splitting it along alternating axes: +//! +//! 1. Start with the full spatial extent +//! 2. When a node exceeds `max_items_per_node`, split it: +//! - Choose the longer dimension (x or y) +//! - Sort items by their minimum coordinate on that dimension +//! - Split at the median item's coordinate +//! 3. Continue until reaching `max_levels` depth or items fit in nodes +//! 4. Assign sequential IDs to leaf nodes + +use std::sync::Arc; + +use crate::partitioning::{ + util::{bbox_to_geo_rect, rect_contains_point, rect_intersection_area, rects_intersect}, + SpatialPartition, SpatialPartitioner, +}; +use datafusion_common::Result; +use geo::{Coord, Rect}; +use sedona_geometry::bounding_box::BoundingBox; + +/// K-D-B tree spatial partitioner implementation. +/// +/// See https://en.wikipedia.org/wiki/K-D-B-tree +/// +/// The KDB tree is a hierarchical spatial data structure that recursively +/// partitions space using axis-aligned splits. It adapts to the spatial +/// distribution of data, making it effective for spatial partitioning. +/// +/// # Example +/// +/// ``` +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::{SpatialPartitioner, SpatialPartition}; +/// +/// // Create sample bounding boxes +/// let bboxes = (0..20).map(|i| { +/// let x = (i % 10) as f64 * 10.0; +/// let y = (i / 10) as f64 * 10.0; +/// BoundingBox::xy((x, x + 5.0), (y, y + 5.0)) +/// }); +/// +/// // Build the KDB partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build(bboxes, 5, 3, extent) +/// .expect("failed to build KDB partitioner"); +/// +/// // Query which partition a bounding box belongs to +/// let test_bbox = BoundingBox::xy((1.0, 4.0), (1.0, 4.0)); +/// match partitioner.partition(&test_bbox).unwrap() { +/// SpatialPartition::Regular(id) => println!("Partition ID: {}", id), +/// _ => unreachable!(), +/// } +/// ``` +pub(crate) struct KDBTree { + max_items_per_node: usize, + max_levels: usize, + extent: Rect<f32>, + level: usize, + items: Vec<Rect<f32>>, + children: Option<Box<[KDBTree; 2]>>, + leaf_id: u32, +} + +#[cfg(test)] +pub(crate) struct KDBResult { + extent: Rect<f32>, + leaf_id: u32, +} + +#[cfg(test)] +impl KDBResult { + fn new(kdb: &KDBTree) -> Self { + Self { + extent: kdb.extent, + leaf_id: kdb.leaf_id, + } + } +} + +impl KDBTree { + /// Create a new KDB tree with the given parameters. + /// + /// # Arguments + /// * `max_items_per_node` - Maximum number of items before splitting a node + /// * `max_levels` - Maximum depth of the tree + /// * `extent` - The spatial extent covered by this tree + pub fn new(max_items_per_node: usize, max_levels: usize, extent: BoundingBox) -> Result<Self> { + let extent_rect = bbox_to_geo_rect(&extent)?; + Ok(Self::new_with_level( + max_items_per_node, + max_levels, + 0, + extent_rect, + )) + } + + fn new_with_level( + max_items_per_node: usize, + max_levels: usize, + level: usize, + extent: Rect<f32>, + ) -> Self { + KDBTree { + max_items_per_node, + max_levels, + extent, + level, + items: Vec::new(), + children: None, + leaf_id: 0, + } + } + + /// Insert a bounding box into the tree. + pub fn insert(&mut self, bbox: BoundingBox) -> Result<()> { + let rect = bbox_to_geo_rect(&bbox)?; + self.insert_rect(rect); + Ok(()) + } + + fn insert_rect(&mut self, rect: Rect<f32>) { + if self.items.len() < self.max_items_per_node || self.level >= self.max_levels { + self.items.push(rect); + } else { + if self.children.is_none() { + // Split over longer side + let split_x = self.extent.width() > self.extent.height(); + let mut ok = self.split(split_x); + if !ok { + // Try splitting by the other side + ok = self.split(!split_x); + } + + if !ok { + // This could happen if all envelopes are the same. + self.items.push(rect); + return; + } + } + + // Insert into appropriate child + if let Some(ref mut children) = self.children { + let min_point = rect.min(); + for child in children.iter_mut() { + if rect_contains_point(child.extent(), &min_point) { + child.insert_rect(rect); + break; + } + } + } + } + } + + /// Check if this node is a leaf node. + pub fn is_leaf(&self) -> bool { + self.children.is_none() + } + + /// Get the leaf ID (only valid for leaf nodes). + pub fn leaf_id(&self) -> u32 { + assert!(self.is_leaf(), "leaf_id() called on non-leaf node"); + self.leaf_id + } + + /// Get the spatial extent of this node. + pub fn extent(&self) -> &Rect<f32> { + &self.extent + } + + /// Assign leaf IDs to all leaf nodes in the tree (breadth-first traversal). + pub fn assign_leaf_ids(&mut self) { + let mut next_id = 0; + self.assign_leaf_ids_recursive(&mut next_id); + } + + fn assign_leaf_ids_recursive(&mut self, next_id: &mut u32) { + if self.is_leaf() { + self.leaf_id = *next_id; + *next_id += 1; + } else if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.assign_leaf_ids_recursive(next_id); + } + } + } + + /// Find all leaf nodes that intersect with the given bounding box. + #[cfg(test)] + pub fn find_leaf_nodes(&self, rect: &Rect<f32>, matches: &mut Vec<KDBResult>) { + self.visit_intersecting_leaf_nodes(rect, &mut |kdb| { + matches.push(KDBResult::new(kdb)); + }); + } + + pub fn visit_intersecting_leaf_nodes<'a>( + &'a self, + rect: &Rect<f32>, + f: &mut impl FnMut(&'a KDBTree), + ) { + if !rects_intersect(&self.extent, rect) { + return; + } + + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_intersecting_leaf_nodes(rect, f); + } + } + } + + pub fn visit_leaf_nodes<'a>(&'a self, f: &mut impl FnMut(&'a KDBTree)) { + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_leaf_nodes(f); + } + } + } + + #[cfg(test)] + pub fn collect_leaf_nodes(&self) -> Vec<&KDBTree> { + let mut leaf_nodes = Vec::new(); + self.visit_leaf_nodes(&mut |kdb| { + leaf_nodes.push(kdb); + }); + leaf_nodes + } + + pub fn num_leaf_nodes(&self) -> usize { + let mut num = 0; + self.visit_leaf_nodes(&mut |_| { + num += 1; + }); + num + } + + /// Drop all stored items to save memory after tree construction. + pub fn drop_elements(&mut self) { + self.items.clear(); + if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.drop_elements(); + } + } + } + + /// Split this node along the specified axis. + /// Returns true if the split was successful, false otherwise. + fn split(&mut self, split_x: bool) -> bool { + // Sort items by the appropriate coordinate + if split_x { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } else { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } + + // Find the split coordinate from the middle item + let middle_idx = self.items.len() / 2; + let middle_min = self.items[middle_idx].min(); + + let split_coord = if split_x { middle_min.x } else { middle_min.y }; + + let extent_min = self.extent.min(); + let extent_max = self.extent.max(); + + // Check if split coordinate is valid + if split_x { + if split_coord <= extent_min.x || split_coord >= extent_max.x { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + } else if split_coord <= extent_min.y || split_coord >= extent_max.y { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + + // Create child extents + let (left_extent, right_extent) = if split_x { + let left = Rect::new( + extent_min, + Coord { + x: split_coord, + y: extent_max.y, + }, + ); + let right = Rect::new( + Coord { + x: split_coord, + y: extent_min.y, + }, + extent_max, + ); + (left, right) + } else { + let left = Rect::new( + extent_min, + Coord { + x: extent_max.x, + y: split_coord, + }, + ); + let right = Rect::new( + Coord { + x: extent_min.x, + y: split_coord, + }, + extent_max, + ); + (left, right) + }; + + // Create children + let mut left_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + left_extent, + ); + let mut right_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + right_extent, + ); + + // Distribute items to children + for item in self.items.drain(..) { + let belongs_to_left = if split_x { + item.min().x <= split_coord + } else { + item.min().y <= split_coord + }; + + if belongs_to_left { + left_child.insert_rect(item); + } else { + right_child.insert_rect(item); + } + } Review Comment: Not sure if this is better or not: ```suggestion if split_x { for item in self.items.drain(..) { if item.min().x <= split_coord { left_child.insert_rect(item); } else { right_child.insert_rect(item); } } } else { for item in self.items.drain(..) { if item.min().y <= split_coord { left_child.insert_rect(item); } else { right_child.insert_rect(item); } } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
