pitrou commented on code in PR #38863:
URL: https://github.com/apache/arrow/pull/38863#discussion_r1409071192
##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -136,6 +144,15 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
bloom_filter.Init(header_buf->data() + header_size, bloom_filter_size);
return bloom_filter;
}
+ if (bloom_filter_length && *bloom_filter_length < bloom_filter_size +
header_size) {
Review Comment:
The code is getting a bit confusing with `bloom_filter_length` vs.
`bloom_filter_size`. Perhaps rename the latter to `bloom_filter_data_size`?
##########
cpp/src/parquet/bloom_filter_reader_test.cc:
##########
@@ -25,31 +25,41 @@
namespace parquet::test {
TEST(BloomFilterReader, ReadBloomFilter) {
- std::string dir_string(parquet::test::get_data_dir());
- std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
- auto reader = ParquetFileReader::OpenFile(path, false);
- auto file_metadata = reader->metadata();
- EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
- auto& bloom_filter_reader = reader->GetBloomFilterReader();
- auto row_group_0 = bloom_filter_reader.RowGroup(0);
- ASSERT_NE(nullptr, row_group_0);
- EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
- auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
- ASSERT_NE(nullptr, bloom_filter);
- EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);
+ struct BloomFilterTestFile {
+ std::string filename;
+ bool has_bloom_filter_length;
+ };
+ std::vector<BloomFilterTestFile> files = {
+ {"data_index_bloom_encoding_stats.parquet", false},
+ {"data_index_bloom_encoding_with_length.parquet", false},
Review Comment:
Why `false` here?
##########
cpp/src/parquet/bloom_filter_reader_test.cc:
##########
@@ -25,31 +25,41 @@
namespace parquet::test {
TEST(BloomFilterReader, ReadBloomFilter) {
- std::string dir_string(parquet::test::get_data_dir());
- std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
- auto reader = ParquetFileReader::OpenFile(path, false);
- auto file_metadata = reader->metadata();
- EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
- auto& bloom_filter_reader = reader->GetBloomFilterReader();
- auto row_group_0 = bloom_filter_reader.RowGroup(0);
- ASSERT_NE(nullptr, row_group_0);
- EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
- auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
- ASSERT_NE(nullptr, bloom_filter);
- EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);
+ struct BloomFilterTestFile {
+ std::string filename;
+ bool has_bloom_filter_length;
+ };
+ std::vector<BloomFilterTestFile> files = {
+ {"data_index_bloom_encoding_stats.parquet", false},
+ {"data_index_bloom_encoding_with_length.parquet", false},
+ };
+ for (const auto& test_file : files) {
+ std::string dir_string(parquet::test::get_data_dir());
+ std::string path = dir_string + "/" + test_file.filename;
+ auto reader = ParquetFileReader::OpenFile(path, false);
Review Comment:
What is `false` here? Let's make code more readable.
```suggestion
auto reader = ParquetFileReader::OpenFile(path, /*xxx=*/false);
```
##########
cpp/src/parquet/bloom_filter_reader_test.cc:
##########
@@ -25,31 +25,41 @@
namespace parquet::test {
TEST(BloomFilterReader, ReadBloomFilter) {
- std::string dir_string(parquet::test::get_data_dir());
- std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
- auto reader = ParquetFileReader::OpenFile(path, false);
- auto file_metadata = reader->metadata();
- EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
- auto& bloom_filter_reader = reader->GetBloomFilterReader();
- auto row_group_0 = bloom_filter_reader.RowGroup(0);
- ASSERT_NE(nullptr, row_group_0);
- EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
- auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
- ASSERT_NE(nullptr, bloom_filter);
- EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);
+ struct BloomFilterTestFile {
+ std::string filename;
+ bool has_bloom_filter_length;
+ };
+ std::vector<BloomFilterTestFile> files = {
+ {"data_index_bloom_encoding_stats.parquet", false},
+ {"data_index_bloom_encoding_with_length.parquet", false},
+ };
+ for (const auto& test_file : files) {
+ std::string dir_string(parquet::test::get_data_dir());
+ std::string path = dir_string + "/" + test_file.filename;
+ auto reader = ParquetFileReader::OpenFile(path, false);
+ auto file_metadata = reader->metadata();
+ EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
+ auto& bloom_filter_reader = reader->GetBloomFilterReader();
+ auto row_group_0 = bloom_filter_reader.RowGroup(0);
+ ASSERT_NE(nullptr, row_group_0);
+ EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
+ auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
+ ASSERT_NE(nullptr, bloom_filter);
+ EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);
Review Comment:
Same question here.
##########
cpp/src/parquet/bloom_filter_reader_test.cc:
##########
@@ -25,31 +25,41 @@
namespace parquet::test {
TEST(BloomFilterReader, ReadBloomFilter) {
- std::string dir_string(parquet::test::get_data_dir());
- std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
- auto reader = ParquetFileReader::OpenFile(path, false);
- auto file_metadata = reader->metadata();
- EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
- auto& bloom_filter_reader = reader->GetBloomFilterReader();
- auto row_group_0 = bloom_filter_reader.RowGroup(0);
- ASSERT_NE(nullptr, row_group_0);
- EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
- auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
- ASSERT_NE(nullptr, bloom_filter);
- EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);
+ struct BloomFilterTestFile {
+ std::string filename;
+ bool has_bloom_filter_length;
+ };
+ std::vector<BloomFilterTestFile> files = {
+ {"data_index_bloom_encoding_stats.parquet", false},
+ {"data_index_bloom_encoding_with_length.parquet", false},
+ };
+ for (const auto& test_file : files) {
+ std::string dir_string(parquet::test::get_data_dir());
+ std::string path = dir_string + "/" + test_file.filename;
+ auto reader = ParquetFileReader::OpenFile(path, false);
+ auto file_metadata = reader->metadata();
+ EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
+ auto& bloom_filter_reader = reader->GetBloomFilterReader();
+ auto row_group_0 = bloom_filter_reader.RowGroup(0);
+ ASSERT_NE(nullptr, row_group_0);
+ EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
Review Comment:
Why does it throw? Is it possible to check the exception message, or at
least add a comment?
##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -136,6 +144,15 @@ BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(
bloom_filter.Init(header_buf->data() + header_size, bloom_filter_size);
return bloom_filter;
}
+ if (bloom_filter_length && *bloom_filter_length < bloom_filter_size +
header_size) {
Review Comment:
Also, why the inequality? We should have `*bloom_filter_length ==
bloom_filter_data_size + header_size`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]