mapleFU commented on issue #14923:
URL: https://github.com/apache/arrow/issues/14923#issuecomment-1366427048
I cannot reproduce this problem on my macOS machine; my guess is that it comes
from the SIMD unpack path or something similar. I will keep the diff below and
try it on my PC tonight.
```diff
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 9761dfd30..06cf385cc 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2248,9 +2248,9 @@ void DeltaBitPackEncoder<DType>::FlushBlock() {
// needed to store the values, the bytes storing the bit widths of the
unneeded
// miniblocks are still present, their value should be zero, but readers
must accept
// arbitrary values as well.
- for (uint32_t i = num_miniblocks; i < mini_blocks_per_block_; i++) {
- bit_width_data[i] = 0;
- }
+ // for (uint32_t i = num_miniblocks; i < mini_blocks_per_block_; i++) {
+ // bit_width_data[i] = 0;
+ // }
DCHECK_EQ(values_current_block_, 0);
bit_writer_.Flush();
diff --git a/cpp/src/parquet/encoding_test.cc
b/cpp/src/parquet/encoding_test.cc
index 3b4cafab8..26a5c8d5b 100644
--- a/cpp/src/parquet/encoding_test.cc
+++ b/cpp/src/parquet/encoding_test.cc
@@ -1057,6 +1057,42 @@ TEST_F(DictEncoding, CheckDecodeIndicesNoNulls) {
CheckDict(actual_num_values, *builder);
}
+class DeltaBitPackEncoding : public TestArrowBuilderDecoding {
+ public:
+ void SetupEncoderDecoder() override {}
+};
+
+TEST_F(DeltaBitPackEncoding, Example) {
+ std::shared_ptr<ColumnDescriptor> descr_ = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr_.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ unsigned char good_data [] =
"\200\001\004A\237\224\316\362\r\242\220\203- ";
+ int encode_buffer_size = 273;
+ int num_values = 65;
+ std::vector<uint8_t> output_bytes = std::vector<uint8_t>(num_values *
sizeof(c_type));
+ auto decode_buf = reinterpret_cast<c_type*>(output_bytes.data());
+ decoder->SetData(num_values, &good_data[0], encode_buffer_size);
+ int values_decoded = decoder->Decode(decode_buf, num_values);
+ ASSERT_EQ(num_values, values_decoded);
+
+ unsigned char bad_data[] =
+ "\200\001\004A\237\224\316\362\r\242\220\203- "
+
"\245\245\304;`\210'\313\r\270D\316\306h㖀~\372\255\360A\254}\211L\343\373_"
+ "\034®\312Y\036\233<\203\035P\202)\307Y\356\327\024\302!\232\036,"
+
"\271\b\331\353\037e\333\332\315Crm\203\350בOo\001\347\305Z\203G\037\263Y\254\366_"
+
"\"\v\276\242Y\002\374\300\226\231\252C\240\363ۙ\r\334E\314\f\002\255\227\273\307"
+ "\305'\"\033\235\374\250\243\244F\266\254\350\203\304U\036X\331&\210/"
+ "\037\322\321s.\031e/"
+
"\232\340\363\366\306\030\243,5\337\031\005bw\021\017wj\003#Q`\371ʉ\323\300+~="
+ "\232W\232\374p\336$\022\211VQ\237>\v1gە'\224\207\262f\247h\363A!"
+ "\255\271f\026\274\033_\333)4";
+ output_bytes = std::vector<uint8_t>(num_values * sizeof(c_type));
+ decode_buf = reinterpret_cast<c_type*>(output_bytes.data());
+ decoder->SetData(num_values, &bad_data[0], encode_buffer_size);
+}
+
+
// ----------------------------------------------------------------------
// BYTE_STREAM_SPLIT encode/decode tests.
@@ -1388,6 +1424,7 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
ASSERT_NO_FATAL_FAILURE(
this->Execute((values_per_mini_block * values_per_block) + 1, 10));
ASSERT_NO_FATAL_FAILURE(this->Execute(0, 0));
+ ASSERT_NO_FATAL_FAILURE(this->Execute(65, 1));
ASSERT_NO_FATAL_FAILURE(this->ExecuteSpaced(
/*nvalues*/ 1234, /*repeats*/ 1, /*valid_bits_offset*/ 64,
/*null_probability*/ 0.1));
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]