alamb commented on code in PR #10229:
URL: https://github.com/apache/arrow-rs/pull/10229#discussion_r3492491438
##########
arrow-row/src/fixed.rs:
##########
@@ -469,9 +422,20 @@ where
T::Native: FixedLengthEncoding,
{
assert!(PrimitiveArray::<T>::is_compatible(&data_type));
- // SAFETY:
- // Validated data type above
- unsafe { decode_fixed::<T::Native>(rows, data_type, options).into() }
+
+ let len = rows.len();
+
+ let mut values = BufferBuilder::<T::Native>::new(len);
Review Comment:
We could potentially use Vec<T::Native> here which may be faster
##########
arrow-row/src/fixed.rs:
##########
@@ -402,61 +397,19 @@ pub fn decode_bool(rows: &mut [&[u8]], options:
SortOptions) -> BooleanArray {
values.push(values_packed);
}
- let builder = ArrayDataBuilder::new(DataType::Boolean)
- .len(rows.len())
- .null_count(null_count)
- .add_buffer(values.into())
- .null_bit_buffer(Some(nulls.into()));
+ let nulls = NullBuffer::new(BooleanBuffer::new(nulls.into(), 0, len));
Review Comment:
I think `new` recomputes the null cound
We could use `new_unchecked` here to keep the exact same behavior
https://docs.rs/arrow/latest/arrow/buffer/struct.NullBuffer.html#method.new_unchecked
However it seems from the bencmarks it doesn't really matter (this
formulation is as fast or faster as main)
```
ppend_rows 4096 bool(0, 0.5)
1.05 5.1±0.02µs ?
?/sec 1.00 4.9±0.00µs ? ?/sec
append_rows 4096 bool(0.3, 0.5)
1.00 5.8±0.00µs ?
?/sec 1.00 5.8±0.00µs ? ?/se
```
##########
arrow-row/src/lib.rs:
##########
@@ -2119,17 +2118,16 @@ unsafe fn decode_column(
cols.into_iter().next().unwrap()
}
Codec::Struct(converter, _) => {
- let (null_count, nulls) = fixed::decode_nulls(rows);
+ let nulls = fixed::decode_nulls(rows);
Review Comment:
this is really nice
##########
arrow-row/src/lib.rs:
##########
@@ -2139,14 +2137,15 @@ unsafe fn decode_column(
.collect(),
_ => unreachable!("Only Struct types should be corrected
here"),
};
- let corrected_struct_type =
DataType::Struct(corrected_fields.into());
- let builder = ArrayDataBuilder::new(corrected_struct_type)
- .len(rows.len())
- .null_count(null_count)
- .null_bit_buffer(Some(nulls))
- .child_data(child_data);
-
- Arc::new(StructArray::from(unsafe { builder.build_unchecked() }))
+
+ Arc::new(unsafe {
Review Comment:
I think it would be nice to add a small safety justification here, though I
see the previous code didn't have one either
##########
arrow-row/src/variable.rs:
##########
@@ -69,6 +71,12 @@ pub(crate) fn non_null_padded_length(len: usize) -> usize {
}
}
+pub(crate) fn decode_nulls_sentinel(rows: &[&[u8]], options: SortOptions) ->
Option<NullBuffer> {
+ let nulls = BooleanBuffer::collect_bool(rows.len(), |x| rows[x][0] !=
null_sentinel(options));
Review Comment:
Given there are only two null values for null sentinel, I wonder if we could
make two separate loops here for the two values and generate even more
efficient code
As a follow PR
##########
arrow-row/src/variable.rs:
##########
@@ -69,6 +71,12 @@ pub(crate) fn non_null_padded_length(len: usize) -> usize {
}
}
+pub(crate) fn decode_nulls_sentinel(rows: &[&[u8]], options: SortOptions) ->
Option<NullBuffer> {
Review Comment:
perhaps some small comments explaining what it does would help future readers
##########
arrow-row/src/fixed.rs:
##########
@@ -402,61 +397,19 @@ pub fn decode_bool(rows: &mut [&[u8]], options:
SortOptions) -> BooleanArray {
values.push(values_packed);
}
- let builder = ArrayDataBuilder::new(DataType::Boolean)
- .len(rows.len())
- .null_count(null_count)
- .add_buffer(values.into())
- .null_bit_buffer(Some(nulls.into()));
+ let nulls = NullBuffer::new(BooleanBuffer::new(nulls.into(), 0, len));
+ let nulls = (nulls.null_count() > 0).then_some(nulls);
- // SAFETY:
- // Buffers are the correct length
- unsafe { BooleanArray::from(builder.build_unchecked()) }
+ BooleanArray::new(BooleanBuffer::new(values.into(), 0, len), nulls)
}
/// Decodes a single byte from each row, interpreting `0x01` as a valid value
-/// and all other values as a null
-///
-/// Returns the null count and null buffer
-pub fn decode_nulls(rows: &[&[u8]]) -> (usize, Buffer) {
Review Comment:
I think NullBuffers also contain a null count, however, they compute the
null count with quite optimized code, so I think recomputing it in many cases
is fine
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]