HippoBaro commented on code in PR #9653:
URL: https://github.com/apache/arrow-rs/pull/9653#discussion_r3277197373
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -587,43 +710,111 @@ where
}
}
-/// Computes the min and max for the provided array and indices
-///
-/// This is a free function so it can be used with `downcast_op!`
-fn compute_min_max<T>(
- array: T,
- mut valid: impl Iterator<Item = usize>,
-) -> Option<(ByteArray, ByteArray)>
+/// Encodes a contiguous range from an offset-based byte array
+fn encode_dense<T>(
+ values: &GenericByteArray<T>,
+ offset: usize,
+ len: usize,
+ encoder: &mut ByteArrayEncoder,
+) where
+ T: ByteArrayType,
+{
+ if len == 0 {
+ return;
+ }
+
+ if encoder.statistics_enabled != EnabledStatistics::None {
+ update_statistics(encoder, dense_byte_values(values, offset, len));
+ }
+
+ if let Some(bloom_filter) = &mut encoder.bloom_filter {
+ update_bloom_filter(bloom_filter, dense_byte_values(values, offset,
len));
+ }
+
+ match &mut encoder.dict_encoder {
+ Some(dict_encoder) => dict_encoder.encode_dense(values, offset, len),
+ None => encoder.fallback.encode_dense(values, offset, len),
+ }
+}
+
+#[inline]
+fn byte_values<T, I>(values: T, indices: I) -> impl Iterator<Item = T::Item>
where
- T: ArrayAccessor,
- T::Item: Copy + Ord + AsRef<[u8]>,
+ T: ArrayAccessor + Copy,
+ I: Iterator<Item = usize>,
+{
+ indices.map(move |idx| values.value(idx))
+}
+
+#[inline]
+fn dense_byte_values<T>(
+ values: &GenericByteArray<T>,
+ offset: usize,
+ len: usize,
+) -> impl ExactSizeIterator<Item = &[u8]>
+where
+ T: ByteArrayType,
+{
+ let offsets = values.value_offsets();
+ let data = values.value_data();
+ (offset..offset + len).map(move |idx| dense_byte_value::<T>(offsets, data,
idx))
+}
+
+#[inline]
+fn update_statistics<T>(encoder: &mut ByteArrayEncoder, values: impl
Iterator<Item = T>)
+where
+ T: Copy + Ord + AsRef<[u8]>,
{
- let first_idx = valid.next()?;
+ if let Some(accumulator) = encoder.geo_stats_accumulator.as_mut() {
+ update_geo_stats_accumulator(accumulator.as_mut(), values);
+ } else if let Some((min, max)) = compute_min_max(values) {
+ if encoder.min_value.as_ref().is_none_or(|m| m > &min) {
+ encoder.min_value = Some(min);
+ }
- let first_val = array.value(first_idx);
+ if encoder.max_value.as_ref().is_none_or(|m| m < &max) {
+ encoder.max_value = Some(max);
+ }
+ }
+}
+
+#[inline]
+fn update_bloom_filter<T>(bloom_filter: &mut Sbbf, values: impl Iterator<Item
= T>)
+where
+ T: AsRef<[u8]>,
+{
+ for value in values {
+ bloom_filter.insert(value.as_ref());
+ }
+}
+
+/// Computes the min and max for the provided values
+#[inline]
Review Comment:
I haven't. For these small methods I tend to default to `#[inline]` out of
habit (and that seems to be common practice in this code base AFAICT.) I'll do
some benchmarking to test that.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]