Copilot commented on code in PR #514:
URL: https://github.com/apache/sedona-db/pull/514#discussion_r2690933176


##########
rust/sedona-spatial-join/src/utils/join_utils.rs:
##########
@@ -355,25 +359,45 @@ pub(crate) fn get_semi_indices<T: ArrowPrimitiveType>(
 where
     NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
 {
-    let mut bitmap = BooleanBufferBuilder::new(range.len());
-    bitmap.append_n(range.len(), false);
-    input_indices
-        .iter()
-        .flatten()
-        .map(|v| v.as_usize())
-        .filter(|v| range.contains(v))
-        .for_each(|v| {
-            bitmap.set_bit(v - range.start, true);
-        });
-
+    let bitmap = build_range_bitmap(&range, input_indices);
     let offset = range.start;
-
     // get the semi index
     (range)
         .filter_map(|idx| (bitmap.get_bit(idx - 
offset)).then_some(T::Native::from_usize(idx)))
         .collect()
 }
 
+pub(crate) fn get_mark_indices<T: ArrowPrimitiveType, R: ArrowPrimitiveType>(
+    range: &Range<usize>,
+    input_indices: &PrimitiveArray<T>,
+) -> PrimitiveArray<R>
+where
+    NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
+{
+    let mut bitmap = build_range_bitmap(range, input_indices);

Review Comment:
   The `build_range_bitmap` function returns a `BooleanBufferBuilder`, but it's 
being assigned to a mutable variable with `mut`. Since `build_range_bitmap` 
already returns a builder that can be finished, the `mut` keyword is necessary 
here. However, this appears correct as-is; no issue detected on second review.



##########
rust/sedona-spatial-join/src/utils/join_utils.rs:
##########
@@ -355,25 +359,45 @@ pub(crate) fn get_semi_indices<T: ArrowPrimitiveType>(
 where
     NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
 {
-    let mut bitmap = BooleanBufferBuilder::new(range.len());
-    bitmap.append_n(range.len(), false);
-    input_indices
-        .iter()
-        .flatten()
-        .map(|v| v.as_usize())
-        .filter(|v| range.contains(v))
-        .for_each(|v| {
-            bitmap.set_bit(v - range.start, true);
-        });
-
+    let bitmap = build_range_bitmap(&range, input_indices);
     let offset = range.start;
-
     // get the semi index
     (range)
         .filter_map(|idx| (bitmap.get_bit(idx - 
offset)).then_some(T::Native::from_usize(idx)))
         .collect()
 }
 
+pub(crate) fn get_mark_indices<T: ArrowPrimitiveType, R: ArrowPrimitiveType>(
+    range: &Range<usize>,
+    input_indices: &PrimitiveArray<T>,
+) -> PrimitiveArray<R>
+where
+    NativeAdapter<T>: From<<T as ArrowPrimitiveType>::Native>,
+{
+    let mut bitmap = build_range_bitmap(range, input_indices);
+    PrimitiveArray::new(
+        vec![R::Native::default(); range.len()].into(),

Review Comment:
   Creating a vector filled with default values and then converting it to an 
array buffer may be inefficient for large ranges. Consider using a more 
efficient buffer creation method or pre-allocating with the exact capacity 
needed.



##########
rust/sedona-spatial-join/src/exec.rs:
##########
@@ -1075,10 +1085,10 @@ mod tests {
             JoinType::Left => "SELECT L.id l_id, R.id r_id FROM L LEFT JOIN R 
ON ST_Intersects(L.geometry, R.geometry) ORDER BY l_id, r_id",
             JoinType::Right => "SELECT L.id l_id, R.id r_id FROM L RIGHT JOIN 
R ON ST_Intersects(L.geometry, R.geometry) ORDER BY l_id, r_id",
             JoinType::Full => "SELECT L.id l_id, R.id r_id FROM L FULL OUTER 
JOIN R ON ST_Intersects(L.geometry, R.geometry) ORDER BY l_id, r_id",
-            JoinType::LeftSemi => "SELECT L.id l_id FROM L WHERE EXISTS 
(SELECT 1 FROM R WHERE ST_Intersects(L.geometry, R.geometry)) ORDER BY l_id",
-            JoinType::RightSemi => "SELECT R.id r_id FROM R WHERE EXISTS 
(SELECT 1 FROM L WHERE ST_Intersects(L.geometry, R.geometry)) ORDER BY r_id",
-            JoinType::LeftAnti => "SELECT L.id l_id FROM L WHERE NOT EXISTS 
(SELECT 1 FROM R WHERE ST_Intersects(L.geometry, R.geometry)) ORDER BY l_id",
-            JoinType::RightAnti => "SELECT R.id r_id FROM R WHERE NOT EXISTS 
(SELECT 1 FROM L WHERE ST_Intersects(L.geometry, R.geometry)) ORDER BY r_id",
+            JoinType::LeftSemi => "SELECT L.id l_id FROM L LEFT SEMI JOIN R ON 
ST_Intersects(L.geometry, R.geometry) ORDER BY l_id",
+            JoinType::RightSemi => "SELECT R.id r_id FROM L RIGHT SEMI JOIN R 
ON ST_Intersects(L.geometry, R.geometry) ORDER BY r_id",
+            JoinType::LeftAnti => "SELECT L.id l_id FROM L LEFT ANTI JOIN R ON 
ST_Intersects(L.geometry, R.geometry) ORDER BY l_id",
+            JoinType::RightAnti => "SELECT R.id r_id FROM L RIGHT ANTI JOIN R 
ON ST_Intersects(L.geometry, R.geometry) ORDER BY r_id",

Review Comment:
   The SQL queries have been changed from `WHERE EXISTS` / `WHERE NOT EXISTS` 
subquery patterns to `LEFT SEMI JOIN` / `LEFT ANTI JOIN` / `RIGHT SEMI JOIN` / 
`RIGHT ANTI JOIN` syntax. While both are semantically equivalent, ensure that 
the DataFusion SQL parser supports this JOIN syntax for semi and anti joins, as 
the subquery pattern is more universally supported.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to