[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3622: feat: take kernel for RunArray

via GitHub Sat, 04 Feb 2023 05:52:59 -0800


tustvold commented on code in PR #3622:
URL: https://github.com/apache/arrow-rs/pull/3622#discussion_r1096537688



##########
arrow-array/src/array/run_array.rs:
##########
@@ -143,6 +143,102 @@ impl<R: RunEndIndexType> RunArray<R> {
             values,
         })
     }
+
+    /// Returns index to the physical array for the given index to the logical 
array.
+    /// Performs a binary search on the run_ends array for the input index.
+    #[inline]
+    pub fn get_physical_index(&self, logical_index: usize) -> Option<usize> {
+        if logical_index >= self.len() {
+            return None;
+        }
+        let mut st: usize = 0;
+        let mut en: usize = self.run_ends().len();
+        while st + 1 < en {
+            let mid: usize = (st + en) / 2;
+            if logical_index
+                < unsafe {
+                    // Safety:
+                    // The value of mid will always be between 1 and len - 1,
+                    // where len is length of run ends array.
+                    // This is based on the fact that `st` starts with 0 and
+                    // `en` starts with len. The condition `st + 1 < en` 
ensures
+                    // `st` and `en` differs atleast by two. So the value of 
`mid`
+                    // will never be either `st` or `en`
+                    self.run_ends().value_unchecked(mid - 1).as_usize()
+                }
+            {
+                en = mid
+            } else {
+                st = mid
+            }
+        }
+        Some(st)
+    }
+
+    /// Returns the physical indices of the input logical indices. Returns 
error if any of the logical
+    /// index cannot be converted to physical index. The logical indices are 
sorted and iterated along
+    /// with run_ends array to find matching physical index. The approach used 
here was chosen over
+    /// finding physical index for each logical index using binary search 
using the function
+    /// `get_physical_index`. Running benchmarks on both approaches showed 
that the approach used here
+    /// scaled well for larger inputs.
+    /// See 
<https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more 
details.
+    #[inline]
+    pub fn get_physical_indices<I>(

Review Comment:
   This is very clever, I like it :+1:



##########
arrow-array/src/array/run_array.rs:
##########
@@ -718,14 +790,62 @@ mod tests {
         let run_array = builder.finish();
         let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
 
+        // Access every index and check if the value in the input array 
matches returned value.
         for (i, inp_val) in input_array.iter().enumerate() {
             if let Some(val) = inp_val {
                 let actual = typed.value(i);
                 assert_eq!(*val, actual)
             } else {
-                let physical_ix = typed.get_physical_index(i).unwrap();
+                let physical_ix = run_array.get_physical_index(i).unwrap();
                 assert!(typed.values().is_null(physical_ix));
             };
         }
     }
+
+    #[test]
+    fn test_get_physical_indices() {
+        let mut logical_len = 10;
+        // Test for logical lengths starting from 10 to 250 increasing by 10
+        while logical_len < 260 {
+            let input_array = build_input_array(logical_len);
+
+            // create run array using input_array
+            let mut builder = PrimitiveRunBuilder::<Int32Type, 
Int32Type>::new();
+            builder.extend(input_array.clone().into_iter());
+
+            let run_array = builder.finish();
+            let physical_values_array =
+                run_array.downcast::<Int32Array>().unwrap().values();

Review Comment:
   ```suggestion
               let physical_values_array = 
as_primitive_array::<Int32Type>(run_array.values());
   ```
   Perhaps?



##########
arrow-array/src/array/run_array.rs:
##########
@@ -718,14 +790,62 @@ mod tests {
         let run_array = builder.finish();
         let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
 
+        // Access every index and check if the value in the input array 
matches returned value.
         for (i, inp_val) in input_array.iter().enumerate() {
             if let Some(val) = inp_val {
                 let actual = typed.value(i);
                 assert_eq!(*val, actual)
             } else {
-                let physical_ix = typed.get_physical_index(i).unwrap();
+                let physical_ix = run_array.get_physical_index(i).unwrap();
                 assert!(typed.values().is_null(physical_ix));
             };
         }
     }
+
+    #[test]
+    fn test_get_physical_indices() {
+        let mut logical_len = 10;
+        // Test for logical lengths starting from 10 to 250 increasing by 10
+        while logical_len < 260 {
+            let input_array = build_input_array(logical_len);
+
+            // create run array using input_array
+            let mut builder = PrimitiveRunBuilder::<Int32Type, 
Int32Type>::new();
+            builder.extend(input_array.clone().into_iter());
+
+            let run_array = builder.finish();
+            let physical_values_array =
+                run_array.downcast::<Int32Array>().unwrap().values();
+
+            // create an array consisiting of all the indices shuffled.
+            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as 
u32)).collect();

Review Comment:
   Could we possibly include the same index multiple times?



##########
arrow/src/util/bench_util.rs:
##########
@@ -145,6 +146,71 @@ pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
     data.iter().map(|x| x.as_deref()).collect()
 }
 
+/// Create primitive run array for given logical and physical array lengths
+pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
+    logical_array_len: usize,
+    physical_array_len: usize,
+) -> RunArray<R> {
+    // typical length of each run

Review Comment:
   ```suggestion
       assert!(logical_array_len > physical_array_len);
       // typical length of each run
   ```



##########
arrow-array/src/array/run_array.rs:
##########
@@ -718,14 +790,62 @@ mod tests {
         let run_array = builder.finish();
         let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
 
+        // Access every index and check if the value in the input array 
matches returned value.
         for (i, inp_val) in input_array.iter().enumerate() {
             if let Some(val) = inp_val {
                 let actual = typed.value(i);
                 assert_eq!(*val, actual)
             } else {
-                let physical_ix = typed.get_physical_index(i).unwrap();
+                let physical_ix = run_array.get_physical_index(i).unwrap();
                 assert!(typed.values().is_null(physical_ix));
             };
         }
     }
+
+    #[test]
+    fn test_get_physical_indices() {
+        let mut logical_len = 10;
+        // Test for logical lengths starting from 10 to 250 increasing by 10
+        while logical_len < 260 {

Review Comment:
   ```suggestion
           for logical_len in (0..250).step_by(10)
   ```



##########
arrow-array/src/array/run_array.rs:
##########
@@ -143,6 +143,102 @@ impl<R: RunEndIndexType> RunArray<R> {
             values,
         })
     }
+
+    /// Returns index to the physical array for the given index to the logical 
array.
+    /// Performs a binary search on the run_ends array for the input index.
+    #[inline]
+    pub fn get_physical_index(&self, logical_index: usize) -> Option<usize> {
+        if logical_index >= self.len() {
+            return None;
+        }
+        let mut st: usize = 0;
+        let mut en: usize = self.run_ends().len();
+        while st + 1 < en {
+            let mid: usize = (st + en) / 2;
+            if logical_index
+                < unsafe {
+                    // Safety:
+                    // The value of mid will always be between 1 and len - 1,
+                    // where len is length of run ends array.
+                    // This is based on the fact that `st` starts with 0 and
+                    // `en` starts with len. The condition `st + 1 < en` 
ensures
+                    // `st` and `en` differs atleast by two. So the value of 
`mid`
+                    // will never be either `st` or `en`
+                    self.run_ends().value_unchecked(mid - 1).as_usize()
+                }
+            {
+                en = mid
+            } else {
+                st = mid
+            }
+        }
+        Some(st)
+    }
+
+    /// Returns the physical indices of the input logical indices. Returns 
error if any of the logical
+    /// index cannot be converted to physical index. The logical indices are 
sorted and iterated along
+    /// with run_ends array to find matching physical index. The approach used 
here was chosen over
+    /// finding physical index for each logical index using binary search 
using the function
+    /// `get_physical_index`. Running benchmarks on both approaches showed 
that the approach used here
+    /// scaled well for larger inputs.
+    /// See 
<https://github.com/apache/arrow-rs/pull/3622#issuecomment-1407753727> for more 
details.
+    #[inline]
+    pub fn get_physical_indices<I>(
+        &self,
+        logical_indices: &[I],
+    ) -> Result<Vec<usize>, ArrowError>
+    where
+        I: ArrowNativeType,
+    {
+        let indices_len = logical_indices.len();
+
+        // `ordered_indices` store index into `logical_indices` and can be used
+        // to iterate `logical_indices` in sorted order.
+        let mut ordered_indices: Vec<usize> = (0..indices_len).collect();
+
+        // Instead of sorting `logical_idices` directly, sort the 
`ordered_indices`
+        // whose values are index of `logical_indices`
+        ordered_indices.sort_unstable_by(|lhs, rhs| {
+            logical_indices[*lhs]
+                .partial_cmp(&logical_indices[*rhs])
+                .unwrap()
+        });
+
+        let mut physical_indices = vec![0; indices_len];
+
+        let mut physical_index = 0_usize;
+        let mut ordered_index = 0_usize;
+        while physical_index < self.run_ends.len() && ordered_index < 
indices_len {

Review Comment:
   Could this be rewritten as
   
   ```
   for (physical_index, run_end) in self.run_ends.values().iter().enumerate() {
       let run_end_value = run_end.as_usize();
   ```



##########
arrow-array/src/array/run_array.rs:
##########
@@ -718,14 +790,62 @@ mod tests {
         let run_array = builder.finish();
         let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
 
+        // Access every index and check if the value in the input array 
matches returned value.
         for (i, inp_val) in input_array.iter().enumerate() {
             if let Some(val) = inp_val {
                 let actual = typed.value(i);
                 assert_eq!(*val, actual)
             } else {
-                let physical_ix = typed.get_physical_index(i).unwrap();
+                let physical_ix = run_array.get_physical_index(i).unwrap();
                 assert!(typed.values().is_null(physical_ix));
             };
         }
     }
+
+    #[test]
+    fn test_get_physical_indices() {
+        let mut logical_len = 10;
+        // Test for logical lengths starting from 10 to 250 increasing by 10
+        while logical_len < 260 {
+            let input_array = build_input_array(logical_len);
+
+            // create run array using input_array
+            let mut builder = PrimitiveRunBuilder::<Int32Type, 
Int32Type>::new();
+            builder.extend(input_array.clone().into_iter());
+
+            let run_array = builder.finish();
+            let physical_values_array =
+                run_array.downcast::<Int32Array>().unwrap().values();
+
+            // create an array consisiting of all the indices shuffled.
+            let mut logical_indices: Vec<u32> = (0_u32..(logical_len as 
u32)).collect();
+            let mut rng = thread_rng();
+            logical_indices.shuffle(&mut rng);
+
+            let physical_indices =
+                run_array.get_physical_indices(&logical_indices).unwrap();
+
+            assert_eq!(logical_indices.len(), physical_indices.len());
+
+            // check value in logical index in the input_array matches 
physical index in typed_run_array
+            logical_indices
+                .iter()
+                .map(|f| f.as_usize())
+                .zip(physical_indices.iter())
+                .for_each(|(logical_ix, physical_ix)| {
+                    let expected = input_array[logical_ix];
+                    match expected {
+                        Some(val) => {
+                            let actual = 
physical_values_array.value(*physical_ix);
+                            assert_eq!(val, actual);

Review Comment:
   ```suggestion
                               
assert!(physical_values_array.is_valid(*physical_ix))
                               assert_eq!(val, actual);
   ```



##########
arrow-array/src/array/run_array.rs:
##########
@@ -718,14 +790,62 @@ mod tests {
         let run_array = builder.finish();
         let typed = run_array.downcast::<PrimitiveArray<Int32Type>>().unwrap();
 
+        // Access every index and check if the value in the input array 
matches returned value.
         for (i, inp_val) in input_array.iter().enumerate() {
             if let Some(val) = inp_val {
                 let actual = typed.value(i);
                 assert_eq!(*val, actual)
             } else {
-                let physical_ix = typed.get_physical_index(i).unwrap();
+                let physical_ix = run_array.get_physical_index(i).unwrap();
                 assert!(typed.values().is_null(physical_ix));
             };
         }
     }
+
+    #[test]
+    fn test_get_physical_indices() {
+        let mut logical_len = 10;
+        // Test for logical lengths starting from 10 to 250 increasing by 10
+        while logical_len < 260 {
+            let input_array = build_input_array(logical_len);
+
+            // create run array using input_array
+            let mut builder = PrimitiveRunBuilder::<Int32Type, 
Int32Type>::new();
+            builder.extend(input_array.clone().into_iter());
+
+            let run_array = builder.finish();
+            let physical_values_array =
+                run_array.downcast::<Int32Array>().unwrap().values();
+
+            // create an array consisiting of all the indices shuffled.

Review Comment:
   ```suggestion
               // create an array consisting of all the indices shuffled.
   ```



##########
arrow-array/src/cast.rs:
##########
@@ -95,6 +95,53 @@ macro_rules! downcast_integer {
     };
 }
 
+/// Given one or more expressions evaluating to an integer [`DataType`] 
invokes the provided macro

Review Comment:
   :+1:



##########
arrow/src/util/bench_util.rs:
##########
@@ -145,6 +146,71 @@ pub fn create_string_dict_array<K: ArrowDictionaryKeyType>(
     data.iter().map(|x| x.as_deref()).collect()
 }
 
+/// Create primitive run array for given logical and physical array lengths
+pub fn create_primitive_run_array<R: RunEndIndexType, V: ArrowPrimitiveType>(
+    logical_array_len: usize,
+    physical_array_len: usize,
+) -> RunArray<R> {
+    // typical length of each run
+    let run_len = logical_array_len / physical_array_len;
+
+    // Some runs should have extra length
+    let mut run_len_extra = logical_array_len % physical_array_len;
+
+    let mut values: Vec<V::Native> = (0..physical_array_len)
+        .flat_map(|s| {
+            let mut take_len = run_len;
+            if run_len_extra > 0 {
+                take_len += 1;
+                run_len_extra -= 1;
+            }
+            std::iter::repeat(V::Native::from_usize(s).unwrap()).take(take_len)
+        })
+        .collect();
+    while values.len() < logical_array_len {
+        let last_val = values[values.len() - 1];
+        values.push(last_val);
+    }
+    let mut builder = PrimitiveRunBuilder::<R, 
V>::with_capacity(physical_array_len);
+    builder.extend(values.into_iter().map(Some));
+
+    builder.finish()
+}
+
+/// Create string array to be used by run array builder. The string array
+/// will result in run array with physial length of `physical_array_len`
+/// and logical length of `logical_array_len`
+pub fn create_string_array_for_runs(
+    physical_array_len: usize,
+    logical_array_len: usize,
+    string_len: usize,
+) -> Vec<String> {
+    let mut rng = thread_rng();

Review Comment:
   ```suggestion
       assert!(logical_array_len > physical_array_len);
       let mut rng = thread_rng();
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3622: feat: take kernel for RunArray

Reply via email to