nevi-me commented on a change in pull request #8170:
URL: https://github.com/apache/arrow/pull/8170#discussion_r489609845



##########
File path: rust/arrow/src/compute/kernels/take.rs
##########
@@ -166,42 +166,124 @@ fn take_primitive<T>(values: &ArrayRef, indices: 
&UInt32Array) -> Result<ArrayRe
 where
     T: ArrowPrimitiveType,
 {
-    let mut builder = PrimitiveBuilder::<T>::new(indices.len());
-    let a = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
-    for i in 0..indices.len() {
-        if indices.is_null(i) {
-            // populate with null if index is null
-            builder.append_null()?;
-        } else {
-            // get index value to use in looking up the value from `values`
-            let ix = indices.value(i) as usize;
-            if a.is_valid(ix) {
-                builder.append_value(a.value(ix))?;
-            } else {
-                builder.append_null()?;
+    let data_len = indices.len();
+
+    let array = values.as_any().downcast_ref::<PrimitiveArray<T>>().unwrap();
+
+    let num_bytes = bit_util::ceil(data_len, 8);
+    let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
false);
+
+    let null_slice = null_buf.data_mut();
+
+    let new_values: Vec<T::Native> = (0..data_len)
+        .map(|i| {
+            let index = indices.value(i) as usize;
+            if array.is_valid(index) {
+                bit_util::set_bit(null_slice, i);
+            }
+            array.value(index)
+        })
+        .collect();
+
+    let nulls = match indices.data_ref().null_buffer() {
+        Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0, 
indices.len()),
+        None => null_buf.freeze(),
+    };
+
+    let data = ArrayData::new(
+        T::get_data_type(),
+        indices.len(),
+        None,
+        Some(nulls),
+        0,
+        vec![Buffer::from(new_values.to_byte_slice())],
+        vec![],
+    );
+    Ok(Arc::new(PrimitiveArray::<T>::from(Arc::new(data))))
+}
+
+/// `take` implementation for boolean arrays
+fn take_boolean(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
+    let data_len = indices.len();
+
+    let array = values.as_any().downcast_ref::<BooleanArray>().unwrap();
+
+    let num_byte = bit_util::ceil(data_len, 8);
+    let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, 
false);
+    let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, 
false);
+
+    let null_slice = null_buf.data_mut();
+    let val_slice = val_buf.data_mut();
+
+    (0..data_len).for_each(|i| {
+        let index = indices.value(i) as usize;
+        if array.is_valid(index) {
+            bit_util::set_bit(null_slice, i);
+            if array.value(index) {
+                bit_util::set_bit(val_slice, i);
             }
         }
-    }
-    Ok(Arc::new(builder.finish()) as ArrayRef)
+    });
+
+    let nulls = match indices.data_ref().null_buffer() {
+        Some(buffer) => buffer_bin_and(buffer, 0, &null_buf.freeze(), 0, 
indices.len()),
+        None => null_buf.freeze(),
+    };
+
+    let data = ArrayData::new(
+        DataType::Boolean,
+        indices.len(),
+        None,
+        Some(nulls),
+        0,
+        vec![val_buf.freeze()],
+        vec![],
+    );
+    Ok(Arc::new(BooleanArray::from(Arc::new(data))))
 }
 
 /// `take` implementation for string arrays
 fn take_string(values: &ArrayRef, indices: &UInt32Array) -> Result<ArrayRef> {
-    let mut builder = StringBuilder::new(indices.len());
-    let a = values.as_any().downcast_ref::<StringArray>().unwrap();
-    for i in 0..indices.len() {
-        if indices.is_null(i) {
-            builder.append(false)?;
-        } else {
-            let ix = indices.value(i) as usize;
-            if a.is_null(ix) {
-                builder.append(false)?;
-            } else {
-                builder.append_value(a.value(ix))?;
-            }
-        }
+    let data_len = indices.len();
+
+    let array = values.as_any().downcast_ref::<StringArray>().unwrap();
+
+    let num_bytes = bit_util::ceil(data_len, 8);
+    let mut null_buf = MutableBuffer::new(num_bytes).with_bitset(num_bytes, 
false);
+    let null_slice = null_buf.data_mut();
+
+    let mut offsets = Vec::with_capacity(data_len + 1);
+    let mut values = Vec::with_capacity(data_len);

Review comment:
       We'll never know the exact value AOT, but I'm wondering if allocating to 
n items is beneficial. Looks like we'd have to find a balance between under and 
over-allocation.
   
   What if we say (byte-len / valid-slots) * index-valid-slots? If the string 
lengths are uniform-ish, we would allocate a vec close enough to the actual.
   
   If we have an input array with 1000 values, and say 800 are valid; and we're 
only taking 200 values out of 250 total. If the length of the string data is 
8000 (avg 10 bytes per slot), then we would allocate 2000 bytes, instead of the 
current 250. The eventual array will definitely be bigger than 250, and 
somewhere close to 2000.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to