alamb commented on code in PR #9653:
URL: https://github.com/apache/arrow-rs/pull/9653#discussion_r3277136023
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -587,43 +710,111 @@ where
}
}
-/// Computes the min and max for the provided array and indices
-///
-/// This is a free function so it can be used with `downcast_op!`
-fn compute_min_max<T>(
- array: T,
- mut valid: impl Iterator<Item = usize>,
-) -> Option<(ByteArray, ByteArray)>
+/// Encodes a contiguous range from an offset-based byte array
+fn encode_dense<T>(
+ values: &GenericByteArray<T>,
+ offset: usize,
+ len: usize,
+ encoder: &mut ByteArrayEncoder,
+) where
+ T: ByteArrayType,
+{
+ if len == 0 {
+ return;
+ }
+
+ if encoder.statistics_enabled != EnabledStatistics::None {
+ update_statistics(encoder, dense_byte_values(values, offset, len));
+ }
+
+ if let Some(bloom_filter) = &mut encoder.bloom_filter {
+ update_bloom_filter(bloom_filter, dense_byte_values(values, offset,
len));
+ }
+
+ match &mut encoder.dict_encoder {
+ Some(dict_encoder) => dict_encoder.encode_dense(values, offset, len),
+ None => encoder.fallback.encode_dense(values, offset, len),
+ }
+}
+
+#[inline]
+fn byte_values<T, I>(values: T, indices: I) -> impl Iterator<Item = T::Item>
where
- T: ArrayAccessor,
- T::Item: Copy + Ord + AsRef<[u8]>,
+ T: ArrayAccessor + Copy,
+ I: Iterator<Item = usize>,
+{
+ indices.map(move |idx| values.value(idx))
+}
+
+#[inline]
+fn dense_byte_values<T>(
+ values: &GenericByteArray<T>,
+ offset: usize,
+ len: usize,
+) -> impl ExactSizeIterator<Item = &[u8]>
+where
+ T: ByteArrayType,
+{
+ let offsets = values.value_offsets();
+ let data = values.value_data();
+ (offset..offset + len).map(move |idx| dense_byte_value::<T>(offsets, data,
idx))
+}
+
+#[inline]
+fn update_statistics<T>(encoder: &mut ByteArrayEncoder, values: impl
Iterator<Item = T>)
+where
+ T: Copy + Ord + AsRef<[u8]>,
{
- let first_idx = valid.next()?;
+ if let Some(accumulator) = encoder.geo_stats_accumulator.as_mut() {
+ update_geo_stats_accumulator(accumulator.as_mut(), values);
+ } else if let Some((min, max)) = compute_min_max(values) {
+ if encoder.min_value.as_ref().is_none_or(|m| m > &min) {
+ encoder.min_value = Some(min);
+ }
- let first_val = array.value(first_idx);
+ if encoder.max_value.as_ref().is_none_or(|m| m < &max) {
+ encoder.max_value = Some(max);
+ }
+ }
+}
+
+#[inline]
+fn update_bloom_filter<T>(bloom_filter: &mut Sbbf, values: impl Iterator<Item
= T>)
+where
+ T: AsRef<[u8]>,
+{
+ for value in values {
+ bloom_filter.insert(value.as_ref());
+ }
+}
+
+/// Computes the min and max for the provided values
+#[inline]
Review Comment:
I wonder if you foudn that inlining actually helps in all these cases? We
have found that in some cases inlining actually makes the performance worse (as
there are a bunch of optimizations in LLVM that are disabled once the function
gets too big)
##########
parquet/src/arrow/arrow_writer/levels.rs:
##########
@@ -859,6 +915,163 @@ impl LevelData {
}
}
+/// Zero-allocation iterator over the indices in a [`ValueSelection`].
Review Comment:
this now has dispatch overhead for each item (the `match self`) -- we could
probably make it even faster by changing the callsite to instantiate different
loops based on the different iterators rather than checking each value 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]