pitrou commented on code in PR #15241:
URL: https://github.com/apache/arrow/pull/15241#discussion_r1066028713
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
Review Comment:
Both of these data are "good", neither is "bad".
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2542,10 +2558,21 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual
public TypedDecoder<DTyp
uint32_t values_per_block_;
uint32_t mini_blocks_per_block_;
uint32_t values_per_mini_block_;
- uint32_t values_current_mini_block_;
uint32_t total_value_count_;
- bool block_initialized_;
+ uint32_t total_value_remaining_;
+ // Remaining values in current mini block. If the current block is the last
mini block,
+ // values_remaining_current_mini_block_ may greater than
total_value_remaining_.
+ uint32_t values_remaining_current_mini_block_;
+
+ // If the page doesn't contain any block, `first_block_initialized_` will
+ // always be false. Otherwise, it will be true when first block initialized.
+ bool first_block_initialized_;
+ // The sum of values up to current block.
Review Comment:
```suggestion
// The number of values up to current mini-block (non including).
```
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2542,10 +2558,21 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual
public TypedDecoder<DTyp
uint32_t values_per_block_;
uint32_t mini_blocks_per_block_;
uint32_t values_per_mini_block_;
- uint32_t values_current_mini_block_;
uint32_t total_value_count_;
- bool block_initialized_;
+ uint32_t total_value_remaining_;
+ // Remaining values in current mini block. If the current block is the last
mini block,
+ // values_remaining_current_mini_block_ may greater than
total_value_remaining_.
+ uint32_t values_remaining_current_mini_block_;
+
+ // If the page doesn't contain any block, `first_block_initialized_` will
+ // always be false. Otherwise, it will be true when first block initialized.
+ bool first_block_initialized_;
+ // The sum of values up to current block.
+ // If the page doesn't contain any value, it will always be 0.
+ // If the page only contains one value, it would be 1 if value loaded.
+ // Otherwise, it would be the sum value up to "current" mini block.
+ uint32_t values_num_up_to_current_mini_block_;
Review Comment:
```suggestion
uint32_t values_up_to_current_mini_block_;
```
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
+ // For needed miniblocks, there bit-widths are all 32.
+ // There bit-widths for unneeded miniblocks are different:
+ // * good_data's unneeded bit-width is 0.
+ // * bad_data's unneeded bit-width is 165.
+ // We use Hex to represent good_data and bad_data.
+ {
+ char good_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020000089F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
+ unsigned char good_data[274];
Review Comment:
Please don't hard code values like that. Instead you can use a `std::string`
or a `std::vector<uint8_t>`.
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
+ // For needed miniblocks, there bit-widths are all 32.
+ // There bit-widths for unneeded miniblocks are different:
+ // * good_data's unneeded bit-width is 0.
+ // * bad_data's unneeded bit-width is 165.
+ // We use Hex to represent good_data and bad_data.
+ {
+ char good_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020000089F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
+ unsigned char good_data[274];
+ for (int i = 0; i < 274; ++i) {
+ auto s = ::arrow::ParseHexValue(&good_data_hex[i * 2], &good_data[i]);
+ ASSERT_TRUE(s.ok());
Review Comment:
ASSERT_OK
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
+ // For needed miniblocks, there bit-widths are all 32.
+ // There bit-widths for unneeded miniblocks are different:
+ // * good_data's unneeded bit-width is 0.
+ // * bad_data's unneeded bit-width is 165.
+ // We use Hex to represent good_data and bad_data.
+ {
+ char good_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020000089F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
+ unsigned char good_data[274];
+ for (int i = 0; i < 274; ++i) {
+ auto s = ::arrow::ParseHexValue(&good_data_hex[i * 2], &good_data[i]);
+ ASSERT_TRUE(s.ok());
+ }
+
+ output_bytes = std::vector<uint8_t>(num_values * sizeof(c_type));
+ decode_buf = reinterpret_cast<c_type*>(output_bytes.data());
+ decoder->SetData(num_values, &good_data[0], encode_buffer_size);
+ values_decoded = decoder->Decode(decode_buf, num_values);
+ ASSERT_EQ(num_values, values_decoded);
+ }
+
+ {
+ char bad_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020A5A589F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
+ unsigned char bad_data[274];
+ for (int i = 0; i < 274; ++i) {
Review Comment:
Please don't copy/paste code while it could trivially be factored out.
##########
cpp/src/parquet/encoding.cc:
##########
@@ -2542,10 +2558,21 @@ class DeltaBitPackDecoder : public DecoderImpl, virtual
public TypedDecoder<DTyp
uint32_t values_per_block_;
uint32_t mini_blocks_per_block_;
uint32_t values_per_mini_block_;
- uint32_t values_current_mini_block_;
uint32_t total_value_count_;
- bool block_initialized_;
+ uint32_t total_value_remaining_;
Review Comment:
```suggestion
uint32_t total_values_remaining_;
```
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
+ // For needed miniblocks, there bit-widths are all 32.
+ // There bit-widths for unneeded miniblocks are different:
+ // * good_data's unneeded bit-width is 0.
+ // * bad_data's unneeded bit-width is 165.
+ // We use Hex to represent good_data and bad_data.
+ {
+ char good_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020000089F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
+ unsigned char good_data[274];
+ for (int i = 0; i < 274; ++i) {
+ auto s = ::arrow::ParseHexValue(&good_data_hex[i * 2], &good_data[i]);
+ ASSERT_TRUE(s.ok());
+ }
+
+ output_bytes = std::vector<uint8_t>(num_values * sizeof(c_type));
+ decode_buf = reinterpret_cast<c_type*>(output_bytes.data());
+ decoder->SetData(num_values, &good_data[0], encode_buffer_size);
+ values_decoded = decoder->Decode(decode_buf, num_values);
+ ASSERT_EQ(num_values, values_decoded);
+ }
+
+ {
+ char bad_data_hex[] =
+
"80010441BDA08DB708A488C0F1012020A5A589F79E53946D93CA242D48B21478591FDE6E4855CC03"
+
"7024A4E8E56E1D20687C8BFB4DE880E73CE2E49E873BEB0B1DAD0E0169C7951C411CA76AD949D8ED"
+
"FD9E07A485DF4410B4486253A6280335BB87494B7D61E40A8D487731BBC83A37E1E8EE2F5431150C"
+
"F46D7224B3C7C271AC3220F2C6663289427E1CE3B00F8C34D5DDA845664AF36F56ACF6FED2339911"
+
"7AB54F6667A80EA78D08DC61B4262CE28960B46B2D93785ED9933175A910415B50B30E9889C70ECD"
+
"84C8856531E78B4E05B2AB27B1233E4BAB722869D785F5B8B1049955EFE200000000C6C2201E1836"
+ "C0579B202A6A7395C761044AD151D5CEE992CBBA5AE937E54F29D28F86CC0627B9D1";
Review Comment:
Is this different from the other data above? IIUC, the trailing bytes should
be different, but they seem identical.
##########
cpp/src/parquet/encoding_test.cc:
##########
@@ -1427,5 +1429,66 @@ TYPED_TEST(TestDeltaBitPackEncoding, BasicRoundTrip) {
}
}
+TEST(DeltaBitPackEncoding, MalfordMiniblockBitWidth) {
+ std::shared_ptr<ColumnDescriptor> descr = ExampleDescr<Int32Type>();
+ auto decoder = MakeTypedDecoder<Int32Type>(Encoding::DELTA_BINARY_PACKED,
descr.get());
+ using c_type = parquet::Int32Type::c_type;
+
+ int encode_buffer_size = 274;
+ int num_values = 65;
+ c_type* decode_buf = nullptr;
+ std::vector<uint8_t> output_bytes;
+ int values_decoded = 0;
+
+ // Both good_data and bad_data is a DELTA_BINARY_PACKED INT32 Page with 65
values.
+ // For needed miniblocks, there bit-widths are all 32.
+ // There bit-widths for unneeded miniblocks are different:
+ // * good_data's unneeded bit-width is 0.
+ // * bad_data's unneeded bit-width is 165.
+ // We use Hex to represent good_data and bad_data.
+ {
+ char good_data_hex[] =
Review Comment:
Should use `std::string` or `std::string_view` to have length information.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]