Kontinuation commented on code in PR #645:
URL: https://github.com/apache/sedona-db/pull/645#discussion_r2884208800


##########
rust/sedona-spatial-join/src/index/build_side_collector.rs:
##########
@@ -217,7 +218,7 @@ impl BuildSideBatchesCollector {
         }
 
         let geo_statistics = analyzer.finish();
-        let extra_mem = SpatialIndexBuilder::estimate_extra_memory_usage(
+        let extra_mem = 
DefaultSpatialIndexBuilder::estimate_extra_memory_usage(

Review Comment:
   Do we need to pass in an instance of SpatialIndexBuilder into the build side 
collector to make this call a dynamic dispatch?



##########
rust/sedona-spatial-join/src/index/spatial_index_builder.rs:
##########
@@ -15,59 +15,42 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::array::BooleanBufferBuilder;
-use arrow_schema::SchemaRef;
 use datafusion_physical_plan::metrics::{self, ExecutionPlanMetricsSet, 
MetricBuilder};
 use sedona_common::SpatialJoinOptions;
 use sedona_expr::statistics::GeoStatistics;
 
-use datafusion_common::{utils::proxy::VecAllocExt, Result};
-use datafusion_expr::JoinType;
-use futures::StreamExt;
-use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder, RTreeIndex};
-use parking_lot::Mutex;
-use std::sync::atomic::AtomicUsize;
-
+use crate::index::spatial_index::SpatialIndexRef;
 use crate::{
     evaluated_batch::{evaluated_batch_stream::SendableEvaluatedBatchStream, 
EvaluatedBatch},
-    index::{knn_adapter::KnnComponents, spatial_index::SpatialIndex},
-    operand_evaluator::create_operand_evaluator,
-    refine::create_refiner,
     spatial_predicate::SpatialPredicate,
-    utils::join_utils::need_produce_result_in_final,
 };
-
-// Type aliases for better readability
-type SpatialRTree = RTree<f32>;
-type DataIdToBatchPos = Vec<(i32, i32)>;
-type RTreeBuildResult = (SpatialRTree, DataIdToBatchPos);
-
-/// Rough estimate for in-memory size of the rtree per rect in bytes
-const RTREE_MEMORY_ESTIMATE_PER_RECT: usize = 60;
+use async_trait::async_trait;
+use datafusion_common::Result;
 
 /// Builder for constructing a SpatialIndex from geometry batches.
-///
-/// This builder handles:
-/// 1. Accumulating geometry batches to be indexed
-/// 2. Building the spatial R-tree index
-/// 3. Setting up memory tracking and visited bitmaps
-/// 4. Configuring prepared geometries based on execution mode
-pub struct SpatialIndexBuilder {
-    schema: SchemaRef,
-    spatial_predicate: SpatialPredicate,
-    options: SpatialJoinOptions,
-    join_type: JoinType,
-    probe_threads_count: usize,
-    metrics: SpatialJoinBuildMetrics,
-
-    /// Batches to be indexed
-    indexed_batches: Vec<EvaluatedBatch>,
-
-    /// Statistics for indexed geometries
-    stats: GeoStatistics,
-
-    /// Memory used by the spatial index
-    memory_used: usize,
+#[async_trait]
+pub(crate) trait SpatialIndexBuilder {
+    /// Estimate the amount of memory required by the R-tree index and 
evaluating spatial predicates.
+    /// The estimated memory usage does not include the memory required for 
holding the build side
+    /// batches.
+    fn estimate_extra_memory_usage(
+        geo_stats: &GeoStatistics,
+        spatial_predicate: &SpatialPredicate,
+        options: &SpatialJoinOptions,
+    ) -> usize;
+    /// Add a geometry batch to be indexed.
+    /// This method accumulates geometry batches that will be used to build 
the spatial index.
+    /// Each batch contains processed geometry data along with memory usage 
information.
+    fn add_batch(&mut self, indexed_batch: EvaluatedBatch) -> Result<()>;
+    /// Merge the provided GeoStatistics with the statistics of the batches 
added so far.
+    fn merge_stats(&mut self, stats: GeoStatistics) -> &mut Self;
+    /// Finish building and return the completed SpatialIndex.
+    fn finish(self) -> Result<SpatialIndexRef>;
+    async fn add_stream(
+        &mut self,
+        stream: SendableEvaluatedBatchStream,
+        geo_statistics: GeoStatistics,
+    ) -> Result<()>;

Review Comment:
   I believe that we can simplify this trait by removing `add_batch` and 
`merge_stats`, and only leave `add_stream` in this trait. We can do this 
refactoring in later PRs.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to