zhangfengcdt commented on code in PR #169:
URL: https://github.com/apache/sedona-db/pull/169#discussion_r2410706134
##########
rust/sedona-spatial-join/src/index.rs:
##########
@@ -937,6 +903,112 @@ async fn collect_build_partition(
/// Rough estimate for in-memory size of the rtree per rect in bytes
const RTREE_MEMORY_ESTIMATE_PER_RECT: usize = 60;
+/// Shared KNN components that can be reused across queries
+struct KnnComponents {
+ euclidean_metric: EuclideanDistance,
+ haversine_metric: HaversineDistance,
+ /// Pre-allocated vector for geometry cache - lock-free access
+ /// Indexed by rtree data index for O(1) access
+ geometry_cache: Vec<OnceCell<Geometry<f64>>>,
+ /// Memory reservation to track geometry cache memory usage
+ _reservation: MemoryReservation,
+}
+
+impl KnnComponents {
+ fn new(
+ cache_size: usize,
+ indexed_batches: &[IndexedBatch],
+ memory_pool: Arc<dyn MemoryPool>,
+ ) -> datafusion_common::Result<Self> {
+ // Create memory consumer and reservation for geometry cache
+ let consumer = MemoryConsumer::new("SpatialJoinKnnGeometryCache");
+ let mut reservation = consumer.register(&memory_pool);
+
+ // Estimate maximum possible memory usage based on WKB sizes
+ let estimated_memory =
Self::estimate_max_memory_usage(indexed_batches);
+ reservation.try_grow(estimated_memory)?;
+
+ // Pre-allocate OnceCell vector
+ let geometry_cache = (0..cache_size).map(|_|
OnceCell::new()).collect();
+
+ Ok(Self {
+ euclidean_metric: EuclideanDistance,
+ haversine_metric: HaversineDistance::default(),
+ geometry_cache,
+ _reservation: reservation,
+ })
+ }
+
+ /// Estimate the maximum memory usage for decoded geometries based on WKB
sizes
+ fn estimate_max_memory_usage(indexed_batches: &[IndexedBatch]) -> usize {
+ let mut total_wkb_size = 0;
+
+ for batch in indexed_batches {
+ for wkb in batch.geom_array.wkbs().iter().flatten() {
+ total_wkb_size += wkb.buf().len();
+ }
+ }
+ total_wkb_size
+ }
+}
+
+/// Geometry accessor for SedonaDB KNN queries.
+/// This accessor provides on-demand WKB decoding and geometry caching for
efficient
+/// KNN queries with support for both Euclidean and Haversine distance metrics.
+struct SedonaKnnAdapter<'a> {
+ indexed_batches: &'a [IndexedBatch],
+ data_id_to_batch_pos: &'a [(i32, i32)],
+ // Reference to KNN components for cache and memory tracking
+ knn_components: &'a KnnComponents,
+}
+
+impl<'a> SedonaKnnAdapter<'a> {
+ /// Create a new adapter
+ fn new(
+ indexed_batches: &'a [IndexedBatch],
+ data_id_to_batch_pos: &'a [(i32, i32)],
+ knn_components: &'a KnnComponents,
+ ) -> Self {
+ Self {
+ indexed_batches,
+ data_id_to_batch_pos,
+ knn_components,
+ }
+ }
+}
+
+impl<'a> GeometryAccessor for SedonaKnnAdapter<'a> {
+ /// Get geometry for the given item index with lock-free caching
+ fn get_geometry(&self, item_index: usize) -> Option<Geometry<f64>> {
Review Comment:
Yes, I will need to change the trait definition in geo-index first, before
changing this to return a reference.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
users@infra.apache.org