This is an automated email from the ASF dual-hosted git repository.
wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c31fb46544 GH-15173: [C++][Parquet] Fixing ByteStreamSplit Standard
broken (#34140)
c31fb46544 is described below
commit c31fb46544b9c8372e799138bad9223162169473
Author: mwish <[email protected]>
AuthorDate: Sat Feb 18 00:08:19 2023 +0800
GH-15173: [C++][Parquet] Fixing ByteStreamSplit Standard broken (#34140)
### Rationale for this change
This patch fixes bugs in BYTE_STREAM_SPLIT. It enforce the parquet
ByteStreamSplit not to use padding, and force using `sizeof(data) / sizeof(T)`
as num-of-values.
### What changes are included in this PR?
1. Update num-of-values in ByteStreamSplit
2. Add testing
### Are these changes tested?
Yes
### Are there any user-facing changes?
No
* Closes: #15173
Authored-by: mwish <[email protected]>
Signed-off-by: Will Jones <[email protected]>
---
cpp/src/parquet/encoding.cc | 15 ++++++++++----
cpp/src/parquet/encoding_test.cc | 45 ++++++++++++++++++++++++++++++++++++++++
2 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 3ed6d2f0b9..a47dc6b4c0 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2937,11 +2937,18 @@
ByteStreamSplitDecoder<DType>::ByteStreamSplitDecoder(const ColumnDescriptor* de
template <typename DType>
void ByteStreamSplitDecoder<DType>::SetData(int num_values, const uint8_t*
data,
int len) {
- DecoderImpl::SetData(num_values, data, len);
- if (num_values * static_cast<int64_t>(sizeof(T)) > len) {
- throw ParquetException("Data size too small for number of values
(corrupted file?)");
+ if (num_values * static_cast<int64_t>(sizeof(T)) < len) {
+ throw ParquetException(
+ "Data size too large for number of values (padding in byte stream
split data "
+ "page?)");
+ }
+ if (len % sizeof(T) != 0) {
+ throw ParquetException("ByteStreamSplit data size " + std::to_string(len) +
+ " not aligned with type " +
TypeToString(DType::type_num));
}
- num_values_in_buffer_ = num_values;
+ num_values = len / sizeof(T);
+ DecoderImpl::SetData(num_values, data, len);
+ num_values_in_buffer_ = num_values_;
}
template <typename DType>
diff --git a/cpp/src/parquet/encoding_test.cc b/cpp/src/parquet/encoding_test.cc
index ac3ea1118b..1500ba3c80 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1125,6 +1125,29 @@ class TestByteStreamSplitEncoding : public
TestEncodingBase<Type> {
}
}
+ void CheckRoundtripSpaced(const uint8_t* valid_bits,
+ int64_t valid_bits_offset) override {
+ auto encoder =
+ MakeTypedEncoder<Type>(Encoding::BYTE_STREAM_SPLIT, false,
descr_.get());
+ auto decoder = MakeTypedDecoder<Type>(Encoding::BYTE_STREAM_SPLIT,
descr_.get());
+ int null_count = 0;
+ for (auto i = 0; i < num_values_; i++) {
+ if (!bit_util::GetBit(valid_bits, valid_bits_offset + i)) {
+ null_count++;
+ }
+ }
+
+ encoder->PutSpaced(draws_, num_values_, valid_bits, valid_bits_offset);
+ encode_buffer_ = encoder->FlushValues();
+ decoder->SetData(num_values_, encode_buffer_->data(),
+ static_cast<int>(encode_buffer_->size()));
+ auto values_decoded = decoder->DecodeSpaced(decode_buf_, num_values_,
null_count,
+ valid_bits, valid_bits_offset);
+ ASSERT_EQ(num_values_, values_decoded);
+ ASSERT_NO_FATAL_FAILURE(VerifyResultsSpaced<c_type>(decode_buf_, draws_,
num_values_,
+ valid_bits,
valid_bits_offset));
+ }
+
void CheckDecode();
void CheckEncode();
@@ -1247,6 +1270,28 @@ TYPED_TEST(TestByteStreamSplitEncoding,
RoundTripSingleElement) {
ASSERT_NO_FATAL_FAILURE(this->Execute(1, 1));
}
+TYPED_TEST(TestByteStreamSplitEncoding, RoundTripSpace) {
+ ASSERT_NO_FATAL_FAILURE(this->Execute(10000, 1));
+
+ // Spaced test with different sizes and offset to guarantee SIMD
implementation
+ constexpr int kAvx512Size = 64; // sizeof(__m512i) for Avx512
+ constexpr int kSimdSize = kAvx512Size; // Current the max is Avx512
+ constexpr int kMultiSimdSize = kSimdSize * 33;
+
+ for (auto null_prob : {0.001, 0.1, 0.5, 0.9, 0.999}) {
+ // Test with both size and offset up to 3 Simd block
+ for (auto i = 1; i < kSimdSize * 3; i++) {
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, 0, null_prob));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(i, 1, i + 1, null_prob));
+ }
+ // Large block and offset
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 0,
null_prob));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 0,
null_prob));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize, 1, 33,
null_prob));
+ ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(kMultiSimdSize + 33, 1, 33,
null_prob));
+ }
+}
+
TYPED_TEST(TestByteStreamSplitEncoding, CheckOnlyDecode) {
ASSERT_NO_FATAL_FAILURE(this->CheckDecode());
}