etseidl commented on code in PR #6159:
URL: https://github.com/apache/arrow-rs/pull/6159#discussion_r1704888127
##########
parquet/src/encodings/encoding/byte_stream_split_encoder.rs:
##########
@@ -96,3 +111,121 @@ impl<T: DataType> Encoder<T> for ByteStreamSplitEncoder<T>
{
self.buffer.capacity() * std::mem::size_of::<u8>()
}
}
+
+pub struct VariableWidthByteStreamSplitEncoder<T> {
+ buffer: Vec<u8>,
+ type_width: usize,
+ _p: PhantomData<T>,
+}
+
+impl<T: DataType> VariableWidthByteStreamSplitEncoder<T> {
+ pub(crate) fn new(type_length: i32) -> Self {
+ Self {
+ buffer: Vec::new(),
+ type_width: type_length as usize,
+ _p: PhantomData,
+ }
+ }
+}
+
+fn put_fixed<T: DataType, const TYPE_SIZE: usize>(dst: &mut [u8], values:
&[T::T]) {
+ let mut idx = 0;
+ values.iter().for_each(|x| {
+ let bytes = x.as_bytes();
+ if bytes.len() != TYPE_SIZE {
+ panic!(
+ "Mismatched FixedLenByteArray sizes: {} != {}",
+ bytes.len(),
+ TYPE_SIZE
+ );
+ }
+ dst[idx..(TYPE_SIZE + idx)].copy_from_slice(&bytes[..TYPE_SIZE]);
+ idx += TYPE_SIZE;
+ });
+}
+
+fn put_variable<T: DataType>(dst: &mut [u8], values: &[T::T], type_width:
usize) {
+ let mut idx = 0;
+ values.iter().for_each(|x| {
+ let bytes = x.as_bytes();
+ if bytes.len() != type_width {
+ panic!(
+ "Mismatched FixedLenByteArray sizes: {} != {}",
+ bytes.len(),
+ type_width
+ );
+ }
+ dst[idx..idx + type_width].copy_from_slice(bytes);
+ idx += type_width;
+ });
+}
+
+impl<T: DataType> Encoder<T> for VariableWidthByteStreamSplitEncoder<T> {
+ fn put(&mut self, values: &[T::T]) -> Result<()> {
+ ensure_phys_ty!(
+ Type::FIXED_LEN_BYTE_ARRAY,
+ "VariableWidthByteStreamSplitEncoder only supports
FixedLenByteArray types"
+ );
+
+ // FixedLenByteArray is implemented as ByteArray, so there may be gaps
+ // making slice_as_bytes untenable
+ let idx = self.buffer.len();
+ let data_len = values.len() * self.type_width;
+ // Ensure enough capacity for the new data
+ self.buffer.reserve(values.len() * self.type_width);
+ // ...and extend the size of buffer to allow direct access
+ self.buffer.put_bytes(0_u8, data_len);
+ // Get a slice of the buffer corresponding to the location of the new
+ // data
+ let out_buf = &mut self.buffer[idx..idx + data_len];
+
+ // Now copy `values` into the buffer. For `type_width` <= 8 use a fixed
+ // size when performing the copy as it is significantly faster.
+ match self.type_width {
+ 2 => put_fixed::<T, 2>(out_buf, values),
Review Comment:
I could throw 1 in, but FLBA(1) (a one-byte `FIXED_LEN_BYTE_ARRAY`) is kind of
weird. An int would be much faster to deal with for a single byte. I suppose
someone might be tempted to use it for a single (ASCII) character field... but
UTF-8 would need multiple bytes anyway.
> Still wondering why type_width <= 8 by hand

The reason for the special handling of widths 2-8 is shown by the
benchmarks... those numbers are basically the current code vs. using
`put_variable` exclusively. For `Float16`, using `put_fixed` is more than 2X
faster. The speed advantage pretty much goes away at `type_width == 8`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]