mapleFU commented on code in PR #6159:
URL: https://github.com/apache/arrow-rs/pull/6159#discussion_r1704837283
##########
parquet/src/encodings/encoding/byte_stream_split_encoder.rs:
##########
@@ -96,3 +111,121 @@ impl<T: DataType> Encoder<T> for ByteStreamSplitEncoder<T>
{
self.buffer.capacity() * std::mem::size_of::<u8>()
}
}
+
+pub struct VariableWidthByteStreamSplitEncoder<T> {
+ buffer: Vec<u8>,
+ type_width: usize,
+ _p: PhantomData<T>,
+}
+
+impl<T: DataType> VariableWidthByteStreamSplitEncoder<T> {
+ pub(crate) fn new(type_length: i32) -> Self {
+ Self {
+ buffer: Vec::new(),
+ type_width: type_length as usize,
+ _p: PhantomData,
+ }
+ }
+}
+
+fn put_fixed<T: DataType, const TYPE_SIZE: usize>(dst: &mut [u8], values: &[T::T]) {
+ let mut idx = 0;
+ values.iter().for_each(|x| {
+ let bytes = x.as_bytes();
+ if bytes.len() != TYPE_SIZE {
+ panic!(
+ "Mismatched FixedLenByteArray sizes: {} != {}",
+ bytes.len(),
+ TYPE_SIZE
+ );
+ }
+ dst[idx..(TYPE_SIZE + idx)].copy_from_slice(&bytes[..TYPE_SIZE]);
+ idx += TYPE_SIZE;
+ });
+}
+
+fn put_variable<T: DataType>(dst: &mut [u8], values: &[T::T], type_width: usize) {
+ let mut idx = 0;
+ values.iter().for_each(|x| {
+ let bytes = x.as_bytes();
+ if bytes.len() != type_width {
+ panic!(
+ "Mismatched FixedLenByteArray sizes: {} != {}",
+ bytes.len(),
+ type_width
+ );
+ }
+ dst[idx..idx + type_width].copy_from_slice(bytes);
+ idx += type_width;
+ });
+}
+
+impl<T: DataType> Encoder<T> for VariableWidthByteStreamSplitEncoder<T> {
+ fn put(&mut self, values: &[T::T]) -> Result<()> {
+ ensure_phys_ty!(
+ Type::FIXED_LEN_BYTE_ARRAY,
+ "VariableWidthByteStreamSplitEncoder only supports FixedLenByteArray types"
+ );
+
+ // FixedLenByteArray is implemented as ByteArray, so there may be gaps making
+ // slice_as_bytes untenable
+ let idx = self.buffer.len();
+ let data_len = values.len() * self.type_width;
+ // Ensure enough capacity for the new data
+ self.buffer.reserve(values.len() * self.type_width);
+ // ...and extend the size of buffer to allow direct access
+ self.buffer.put_bytes(0_u8, data_len);
+ // Get a slice of the buffer corresponding to the location of the new data
+ let out_buf = &mut self.buffer[idx..idx + data_len];
+
+ // Now copy `values` into the buffer. For `type_width` <= 8 use a fixed size when
+ // performing the copy as it is significantly faster.
+ match self.type_width {
+ 2 => put_fixed::<T, 2>(out_buf, values),
Review Comment:
Still wondering why `type_width` <= 8 is handled by hand
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]