alamb commented on code in PR #10250:
URL: https://github.com/apache/arrow-rs/pull/10250#discussion_r3509192249
##########
arrow-row/src/variable.rs:
##########
@@ -309,42 +309,40 @@ pub fn decode_binary<I: OffsetSizeTrait>(
}
}
-fn decode_binary_view_inner(
+fn decode_binary_view_inner<const VALIDATE_UTF8: bool>(
rows: &mut [&[u8]],
options: SortOptions,
- validate_utf8: bool,
) -> BinaryViewArray {
let len = rows.len();
let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize;
let nulls = decode_nulls_sentinel(rows, options);
- // If we are validating UTF-8, decode all string values (including short
strings)
- // into the values buffer and validate UTF-8 once. If not validating,
- // we save memory by only copying long strings to the values buffer, as
short strings
- // will be inlined into the view and do not need to be stored redundantly.
- let values_capacity = if validate_utf8 {
- // Capacity for all long and short strings
- rows.iter().map(|row| decoded_len(row, options)).sum()
+ // Capacity for all long strings plus room for one short string
+ let mut values_capacity = inline_str_max_len;
+ let mut inline_capacity = 0;
+ for row in rows.iter() {
+ let len = decoded_len(row, options);
+ if len > inline_str_max_len {
+ values_capacity += len;
+ } else if VALIDATE_UTF8 {
+ inline_capacity += len;
+ }
+ }
+ let mut values = MutableBuffer::new(values_capacity);
+ let mut view_utf8_validation_buffer = if VALIDATE_UTF8 {
+ Vec::with_capacity(inline_capacity)
Review Comment:
so this is a new allocation, but given the benchmark it seems to be more
than paid for with the additional performance
##########
arrow-row/src/lib.rs:
##########
@@ -4819,66 +4819,6 @@ mod tests {
}
}
- #[test]
- fn test_values_buffer_smaller_when_utf8_validation_disabled() {
Review Comment:
maybe it is worth updating this test to verify that when utf8 validation is
enabled, the values only contain data for the long strings (not the short
strings(
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]