Copilot commented on code in PR #169:
URL: https://github.com/apache/sedona-db/pull/169#discussion_r2397360544


##########
rust/sedona-spatial-join/src/index.rs:
##########
@@ -937,6 +903,109 @@ async fn collect_build_partition(
 /// Rough estimate for in-memory size of the rtree per rect in bytes
 const RTREE_MEMORY_ESTIMATE_PER_RECT: usize = 60;
 
+/// Shared KNN components that can be reused across queries
+struct KnnComponents {
+    euclidean_metric: EuclideanDistance,
+    haversine_metric: HaversineDistance,
+    /// Pre-allocated vector for geometry cache - lock-free access
+    /// Indexed by rtree data index for O(1) access
+    geometry_cache: Vec<OnceCell<Geometry<f64>>>,
+}
+
+impl KnnComponents {
+    fn new(
+        cache_size: usize,
+        indexed_batches: &[IndexedBatch],
+        memory_pool: Arc<dyn MemoryPool>,
+    ) -> datafusion_common::Result<Self> {
+        // Create memory consumer and reservation for geometry cache
+        let consumer = MemoryConsumer::new("SpatialJoinKnnGeometryCache");
+        let mut memory_reservation = consumer.register(&memory_pool);
+
+        // Estimate maximum possible memory usage based on WKB sizes
+        let estimated_memory = 
Self::estimate_max_memory_usage(indexed_batches);
+        memory_reservation.try_grow(estimated_memory)?;

Review Comment:
   Memory reservation is grown but never used or tracked. The 
`memory_reservation` variable is created and grown but then dropped without 
being stored in the struct, leading to immediate memory deallocation.



##########
rust/sedona-spatial-join/src/index.rs:
##########
@@ -937,6 +903,109 @@ async fn collect_build_partition(
 /// Rough estimate for in-memory size of the rtree per rect in bytes
 const RTREE_MEMORY_ESTIMATE_PER_RECT: usize = 60;
 
+/// Shared KNN components that can be reused across queries
+struct KnnComponents {
+    euclidean_metric: EuclideanDistance,
+    haversine_metric: HaversineDistance,
+    /// Pre-allocated vector for geometry cache - lock-free access
+    /// Indexed by rtree data index for O(1) access
+    geometry_cache: Vec<OnceCell<Geometry<f64>>>,
+}
+
+impl KnnComponents {
+    fn new(
+        cache_size: usize,
+        indexed_batches: &[IndexedBatch],
+        memory_pool: Arc<dyn MemoryPool>,
+    ) -> datafusion_common::Result<Self> {
+        // Create memory consumer and reservation for geometry cache
+        let consumer = MemoryConsumer::new("SpatialJoinKnnGeometryCache");
+        let mut memory_reservation = consumer.register(&memory_pool);
+
+        // Estimate maximum possible memory usage based on WKB sizes
+        let estimated_memory = 
Self::estimate_max_memory_usage(indexed_batches);
+        memory_reservation.try_grow(estimated_memory)?;
+
+        // Pre-allocate OnceCell vector
+        let geometry_cache = (0..cache_size).map(|_| 
OnceCell::new()).collect();
+
+        Ok(Self {
+            euclidean_metric: EuclideanDistance,
+            haversine_metric: HaversineDistance::default(),
+            geometry_cache,
+        })
+    }
+
+    /// Estimate the maximum memory usage for decoded geometries based on WKB 
sizes
+    fn estimate_max_memory_usage(indexed_batches: &[IndexedBatch]) -> usize {
+        let mut total_wkb_size = 0;
+
+        for batch in indexed_batches {
+            for wkb in batch.geom_array.wkbs().iter().flatten() {
+                total_wkb_size += wkb.buf().len();
+            }
+        }
+        total_wkb_size

Review Comment:
   The memory estimation only considers WKB size but doesn't account for the 
overhead of `OnceCell` containers and decoded `Geometry` objects, which could 
lead to underestimation of actual memory usage.



##########
.github/workflows/rust.yml:
##########
@@ -55,11 +55,14 @@ jobs:
     runs-on: ubuntu-latest
     env:
       CARGO_INCREMENTAL: 0
-      # Reduce debug info to save disk space
-      CARGO_PROFILE_DEV_DEBUG: 1
-      CARGO_PROFILE_TEST_DEBUG: 1
+      # Disable debug info completely to save disk space
+      CARGO_PROFILE_DEV_DEBUG: 0
+      CARGO_PROFILE_TEST_DEBUG: 0
       # Limit parallel compilation to reduce memory pressure
       CARGO_BUILD_JOBS: 2
+      # Strip symbols to reduce binary size
+      CARGO_PROFILE_DEV_STRIP: symbols
+      CARGO_PROFILE_TEST_STRIP: symbols

Review Comment:
   These environment variables are not valid Cargo configuration. The correct 
way to strip symbols is through `Cargo.toml` profiles or using `--config` flag, 
not environment variables.
   ```suggestion
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to