brunal commented on code in PR #9621:
URL: https://github.com/apache/arrow-rs/pull/9621#discussion_r3005326060


##########
arrow-ord/src/cmp.rs:
##########
@@ -373,16 +394,93 @@ fn apply<T: ArrayOrd>(
             Op::GreaterEqual => apply_op(l, l_s, r, r_s, true, T::is_lt),
         };
 
-        // If a side had a dictionary, and was not scalar, we need to 
materialize this
-        Some(match (l_v, r_v) {
-            (Some(l_v), _) if l_s.is_none() => take_bits(l_v, buffer),
-            (_, Some(r_v)) if r_s.is_none() => take_bits(r_v, buffer),
-            _ => buffer,
+        // Expand the physical-length result back to logical length.
+        // Find the non-scalar side that needs expansion (at most one).
+        let (dict, ree) = if l_s.is_none() {
+            (l_v, l_ree)
+        } else if r_s.is_none() {
+            (r_v, r_ree)
+        } else {
+            (None, None)
+        };
+        let buffer = match dict {
+            Some(d) => take_bits(d, buffer),
+            None => buffer,
+        };
+        Some(match ree {
+            Some(info) => expand_from_runs(info, buffer),
+            None => buffer,
         })
     }
 }
 
-/// Perform a take operation on `buffer` with the given dictionary
+/// Build a logical→physical index vector for one side of a non-scalar 
comparison.
+fn logical_indices(
+    len: usize,
+    dict: Option<&dyn AnyDictionaryArray>,
+    ree: Option<&ReeInfo>,
+) -> Vec<usize> {
+    match (dict, ree) {
+        (Some(d), Some(info)) => {
+            let keys = d.normalized_keys();
+            ree_physical_indices(info)
+                .iter()
+                .map(|&i| keys[i])
+                .collect()
+        }
+        (Some(d), None) => d.normalized_keys(),
+        (None, Some(info)) => ree_physical_indices(info),
+        (None, None) => (0..len).collect(),
+    }
+}
+
+fn ree_physical_indices(info: &ReeInfo) -> Vec<usize> {

Review Comment:
   you can turn this into a `.skip().map_while(...)` to return an 
`Iterator<Item = usize>`. Then in `logical_indices`'s Some-Some branch, you can 
directly map on it. This avoid materializing the intermediate physical indices 
vector.
   
   This is very nitpicky, as the one branch that benefits from this, 
`REE<dict>`, seems like quite a niche use case :)



##########
arrow-ord/src/cmp.rs:
##########
@@ -327,28 +336,41 @@ fn compare_op(op: Op, lhs: &dyn Datum, rhs: &dyn Datum) 
-> Result<BooleanArray,
 }
 
 /// Perform a potentially vectored `op` on the provided `ArrayOrd`
+#[allow(clippy::too_many_arguments)]
 fn apply<T: ArrayOrd>(
     op: Op,
     l: T,
     l_s: bool,

Review Comment:
   Ah, right! Still, I think that the too_many_arguments link is legit, and I'd 
consider groupin the args for each side -- either anonymous 3-tuple or a real 
struct. It would make the call site much cleaner. e.g.
   ```
   struct ArrayInfo<'a> {
     is_scalar: bool,
     dict_values: &'a dyn AnyDictionyArray,
     run_array: Option<&'a ReeInfo>,
   }
   ```



##########
arrow-ord/src/cmp.rs:
##########
@@ -373,16 +394,93 @@ fn apply<T: ArrayOrd>(
             Op::GreaterEqual => apply_op(l, l_s, r, r_s, true, T::is_lt),
         };
 
-        // If a side had a dictionary, and was not scalar, we need to 
materialize this
-        Some(match (l_v, r_v) {
-            (Some(l_v), _) if l_s.is_none() => take_bits(l_v, buffer),
-            (_, Some(r_v)) if r_s.is_none() => take_bits(r_v, buffer),
-            _ => buffer,
+        // Expand the physical-length result back to logical length.
+        // Find the non-scalar side that needs expansion (at most one).
+        let (dict, ree) = if l_s.is_none() {
+            (l_v, l_ree)
+        } else if r_s.is_none() {
+            (r_v, r_ree)
+        } else {
+            (None, None)
+        };
+        let buffer = match dict {
+            Some(d) => take_bits(d, buffer),
+            None => buffer,
+        };
+        Some(match ree {
+            Some(info) => expand_from_runs(info, buffer),
+            None => buffer,
         })
     }
 }
 
-/// Perform a take operation on `buffer` with the given dictionary
+/// Build a logical→physical index vector for one side of a non-scalar 
comparison.
+fn logical_indices(
+    len: usize,
+    dict: Option<&dyn AnyDictionaryArray>,
+    ree: Option<&ReeInfo>,
+) -> Vec<usize> {
+    match (dict, ree) {
+        (Some(d), Some(info)) => {
+            let keys = d.normalized_keys();
+            ree_physical_indices(info)
+                .iter()
+                .map(|&i| keys[i])
+                .collect()
+        }
+        (Some(d), None) => d.normalized_keys(),
+        (None, Some(info)) => ree_physical_indices(info),
+        (None, None) => (0..len).collect(),
+    }
+}
+
+fn ree_physical_indices(info: &ReeInfo) -> Vec<usize> {
+    let run_ends = info.run_ends_as_usize();
+    let end = info.offset + info.len;
+    let mut indices = Vec::with_capacity(info.len);
+    let mut pos = info.offset;
+    for (physical, &run_end) in 
run_ends.iter().enumerate().skip(info.start_physical) {
+        let run_end = run_end.min(end);
+        indices.extend(std::iter::repeat_n(physical, run_end - pos));
+        pos = run_end;
+        if pos >= end {
+            break;
+        }
+    }
+    indices
+}
+
+fn scalar_index(dict: Option<&dyn AnyDictionaryArray>, ree: Option<&ReeInfo>) 
-> usize {
+    let idx = ree.map(|r| r.start_physical).unwrap_or(0);

Review Comment:
   unwrap_or_default



##########
arrow-ord/src/cmp.rs:
##########
@@ -701,10 +840,48 @@ pub fn compare_byte_view<T: ByteViewType>(
     unsafe { GenericByteViewArray::compare_unchecked(left, left_idx, right, 
right_idx) }
 }
 
+/// Run-end encoding metadata for one side of a comparison. Holds a reference
+/// to the original REE array for deferred typed access to run_ends.
+struct ReeInfo<'a> {
+    array: &'a dyn Array,
+    offset: usize,
+    start_physical: usize,
+    len: usize,
+}
+
+impl ReeInfo<'_> {
+    /// Materialize run_ends as `Vec<usize>`.
+    fn run_ends_as_usize(&self) -> Vec<usize> {

Review Comment:
   I suspect you can entirely avoid materializing this Vec. Return an 
`Iterator<Item = usize>`.
   In the ree_physical_indices, using the iterator is trivial. In 
apply_op_segment, something like this might work
   
   ```
       let mut l_re = l_info.run_ends_as_usize();
       let mut r_re = r_info.run_ends_as_usize();
       let end = l_info.offset + l_info.len;
       let mut builder = BooleanBufferBuilder::new(l_info.len);
       let mut pos = l_info.offset;
   
       l_re.skip(l_info.start_physical);
       r_re.skip(l_info.start_hysical);
   
        // SAFETY: blablabla
       let mut l_end = l_re.next().unwrap()
       let mut r_end = r_re.next().unwrap();
   
       // Logic below doesn't work if info.end is 0 -- must skip it then
   
       loop {
           let seg_end = l_end.min(r_end);
   
           let result = unsafe { op(l.value_unchecked(l_phys), 
r.value_unchecked(r_phys)) } ^ neg;
           builder.append_n(seg_end - pos, result);
   
           pos = seg_end;
           if pos >= end { break }
   
           if pos >= l_end {
               l_end = l_re.next().unwrap();
           }
           if pos >= r_end {
               r_end = r_re.next().unwrap();
           }
       }
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to