alamb commented on code in PR #6159:
URL: https://github.com/apache/arrow-rs/pull/6159#discussion_r1697535102
##########
parquet/src/encodings/decoding/byte_stream_split_decoder.rs:
##########
@@ -93,15 +132,30 @@ impl<T: DataType> Decoder<T> for ByteStreamSplitDecoder<T>
{
stride,
self.values_decoded,
),
- _ => {
- return Err(general_err!(
- "byte stream split unsupported for data types of size {}
bytes",
- type_size
- ));
- }
+ 16 => join_streams_const::<16>(
+ &self.encoded_bytes,
+ raw_out_bytes,
+ stride,
+ self.values_decoded,
+ ),
+ _ => join_streams(
+ &self.encoded_bytes,
+ raw_out_bytes,
+ stride,
+ type_size,
+ self.values_decoded,
+ ),
}
self.values_decoded += num_values;
+ // FIXME(ets): there's got to be a better way to do this
Review Comment:
Yeah, I agree this is the thing we should try and figure out
##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -1641,6 +1643,86 @@ mod tests {
assert_eq!(row_count, 300);
}
+ #[test]
+ fn test_read_extended_byte_stream_split() {
+ let path = format!(
Review Comment:
I see this test implements the suggestion from
https://github.com/apache/parquet-testing/blob/master/data/README.md#additional-types
> To check conformance of a BYTE_STREAM_SPLIT decoder, read each
BYTE_STREAM_SPLIT-encoded column and compare the decoded values against the
values from the corresponding PLAIN-encoded column. The values should be equal.
However, when I double checked the values with what `pyarrow` python says
they didn't seem to match 🤔
I printed out the f16 column:
```rust
f16_col: PrimitiveArray<Float16>
[
10.3046875,
8.9609375,
10.75,
10.9375,
8.046875,
8.6953125,
10.125,
9.6875,
9.984375,
9.1484375,
...108 elements...,
11.6015625,
9.7578125,
8.9765625,
10.1796875,
10.21875,
11.359375,
10.8359375,
10.359375,
11.4609375,
8.8125,
]
```
```rust
f32_col: PrimitiveArray<Float32>
[
8.827992,
9.48172,
11.511229,
10.637534,
9.301069,
8.986282,
10.032783,
8.78344,
9.328859,
10.31201,
...52 elements...,
7.6898966,
10.054354,
9.528224,
10.459386,
10.701954,
10.138242,
10.760133,
10.229212,
10.530065,
9.295327,
]
```
Here is what python told me:
```python
Python 3.11.9 (main, Apr 2 2024, 08:25:04) [Clang 15.0.0
(clang-1500.3.9.4)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import pyarrow.parquet as pq
>>> table = pq.read_table('byte_stream_split_extended.gzip.parquet')
>>> table
pyarrow.Table
float16_plain: halffloat
float16_byte_stream_split: halffloat
float_plain: float
float_byte_stream_split: float
double_plain: double
double_byte_stream_split: double
int32_plain: int32
int32_byte_stream_split: int32
int64_plain: int64
int64_byte_stream_split: int64
flba5_plain: fixed_size_binary[5]
flba5_byte_stream_split: fixed_size_binary[5]
decimal_plain: decimal128(7, 3)
decimal_byte_stream_split: decimal128(7, 3)
----
float16_plain:
[[18727,18555,18784,18808,18438,...,18573,18770,18637,18687,18667]]
float16_byte_stream_split:
[[18727,18555,18784,18808,18438,...,18573,18770,18637,18687,18667]]
float_plain:
[[10.337575,11.407482,10.090585,10.643939,7.9498277,...,10.138242,10.760133,10.229212,10.530065,9.295327]]
float_byte_stream_split:
[[10.337575,11.407482,10.090585,10.643939,7.9498277,...,10.138242,10.760133,10.229212,10.530065,9.295327]]
double_plain:
[[9.82038858616854,10.196776096656958,10.820528475417419,9.606258827775427,10.521167255732113,...,9.576393393539162,9.47941158714459,10.812601287753644,10.241659719820838,8.225037940357872]]
double_byte_stream_split:
[[9.82038858616854,10.196776096656958,10.820528475417419,9.606258827775427,10.521167255732113,...,9.576393393539162,9.47941158714459,10.812601287753644,10.241659719820838,8.225037940357872]]
int32_plain:
[[24191,41157,7403,79368,64983,...,3584,93802,95977,73925,10300]]
int32_byte_stream_split:
[[24191,41157,7403,79368,64983,...,3584,93802,95977,73925,10300]]
int64_plain:
[[293650000000,41079000000,51248000000,246066000000,572141000000,...,294755000000,343501000000,663621000000,976709000000,836245000000]]
int64_byte_stream_split:
[[293650000000,41079000000,51248000000,246066000000,572141000000,...,294755000000,343501000000,663621000000,976709000000,836245000000]]
```
##########
parquet/src/encodings/decoding.rs:
##########
@@ -1797,6 +1812,27 @@ mod tests {
test_byte_stream_split_decode::<DoubleType>(data);
}
+ #[test]
+ fn test_byte_stream_split_multiple_i32() {
+ let data = vec![
+ vec![
+ i32::from_le_bytes([0xAA, 0xBB, 0xCC, 0xDD]),
+ i32::from_le_bytes([0x00, 0x11, 0x22, 0x33]),
+ ],
+ vec![i32::from_le_bytes([0xA3, 0xB4, 0xC5, 0xD6])],
+ ];
+ test_byte_stream_split_decode::<Int32Type>(data);
+ }
+
+ #[test]
Review Comment:
Perhaps we could also add an error test (i.e. for when the data doesn't have
the right sizes)
##########
parquet/src/encodings/decoding/byte_stream_split_decoder.rs:
##########
@@ -76,11 +94,32 @@ impl<T: DataType> Decoder<T> for ByteStreamSplitDecoder<T> {
let num_values = buffer.len().min(total_remaining_values);
let buffer = &mut buffer[..num_values];
+ let type_size = match T::get_physical_type() {
Review Comment:
We can probably figure out some way to encode this in the trait -- either
`Decoder` directly or maybe some function
What if we made ByteStreamSpitDecoder also be parameterized in the width in
bytes:
```rust
impl<T: DataType, const W: usize> Decoder<T> for ByteStreamSplitDecoder<T,
W> {
...
```
That would require knowing all the possible sizes (is that known a priori?)
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]