[GitHub] [arrow] wgtmac commented on a diff in pull request #37400: GH-34785: [C++][Parquet] Parquet Bloom Filter Writer Implementation

via GitHub Thu, 07 Sep 2023 08:58:46 -0700


wgtmac commented on code in PR #37400:
URL: https://github.com/apache/arrow/pull/37400#discussion_r1318122631



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5584,5 +5592,95 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) {
                             /*null_counts=*/{0}}));
 }
 
+class ParquetBloomFilterRoundTripTest : public ::testing::Test,
+                                        public ParquetIndexRoundTripTest {
+ public:
+  void ReadBloomFilters(int expect_num_row_groups,
+                        const std::set<int>& expect_columns_without_filter = 
{}) {
+    auto read_properties = default_arrow_reader_properties();
+    auto reader = 
ParquetFileReader::Open(std::make_shared<BufferReader>(buffer_));
+
+    auto metadata = reader->metadata();
+    ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups());
+
+    auto& bloom_filter_reader = reader->GetBloomFilterReader();
+
+    for (int rg = 0; rg < metadata->num_row_groups(); ++rg) {
+      auto row_group_reader = bloom_filter_reader.RowGroup(rg);
+      ASSERT_NE(row_group_reader, nullptr);
+
+      for (int col = 0; col < metadata->num_columns(); ++col) {
+        bool expect_no_bloom_filter = expect_columns_without_filter.find(col) 
!=
+                                      expect_columns_without_filter.cend();
+
+        auto bloom_filter = row_group_reader->GetColumnBloomFilter(col);
+        if (expect_no_bloom_filter) {
+          ASSERT_EQ(bloom_filter, nullptr);
+        } else {
+          bloom_filters_.push_back(std::move(bloom_filter));
+        }
+      }
+    }
+  }
+
+  template <typename ArrowType>
+  void VerifyBloomFilter(const BloomFilter* bloom_filter,
+                         const ::arrow::ChunkedArray& chunked_array) {
+    for (auto value : ::arrow::stl::Iterate<ArrowType>(chunked_array)) {
+      if (value == std::nullopt) {
+        continue;
+      }
+      EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value())));
+    }
+  }
+
+ protected:
+  std::vector<std::unique_ptr<BloomFilter>> bloom_filters_;
+};
+
+TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) {
+  BloomFilterOptions options;
+  options.ndv = 100;
+  auto writer_properties = WriterProperties::Builder()
+                               .enable_bloom_filter_options(options, "c0")
+                               ->enable_bloom_filter_options(options, "c1")
+                               ->max_row_group_length(4)
+                               ->build();
+  auto schema = ::arrow::schema(
+      {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", 
::arrow::utf8())});
+  auto table = ::arrow::TableFromJSON(schema, {R"([

Review Comment:
   Could you please prettify the JSON?



##########
cpp/src/parquet/page_index.h:
##########
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include "arrow/io/interfaces.h"
+#include "arrow/io/type_fwd.h"

Review Comment:
   Why is this change needed?



##########
cpp/src/parquet/properties.h:
##########
@@ -139,6 +139,18 @@ static constexpr Encoding::type DEFAULT_ENCODING = 
Encoding::UNKNOWN;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = 
Compression::UNCOMPRESSED;
 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct BloomFilterOptions {

Review Comment:
   PARQUET_EXPORT ?



##########
cpp/src/parquet/properties.h:
##########
@@ -532,6 +563,43 @@ class PARQUET_EXPORT WriterProperties {
       return this->disable_statistics(path->ToDotString());
     }
 
+    /// Disable bloom filter for the column specified by `path`.
+    /// Default disabled.
+    Builder* disable_bloom_filter(const std::string& path) {
+      bloom_filter_options_[path] = std::nullopt;
+      return this;
+    }
+
+    /// Disable bloom filter for the column specified by `path`.
+    /// Default enabled.
+    Builder* disable_bloom_filter(const std::shared_ptr<schema::ColumnPath>& 
path) {
+      return this->disable_bloom_filter(path->ToDotString());
+    }
+
+    /// Enable bloom filter options for the column specified by `path`.
+    ///
+    /// Default disabled.
+    ///
+    /// Note: Currently we don't support bloom filter for boolean columns,
+    /// so if enable bloom filter for boolean columns, it will be ignored.
+    Builder* enable_bloom_filter_options(BloomFilterOptions 
bloom_filter_options,
+                                         const std::string& path) {
+      bloom_filter_options_[path] = bloom_filter_options;
+      return this;
+    }
+
+    /// Enable bloom filter options for the column specified by `path`.
+    ///
+    /// Default disabled.
+    ///
+    /// Note: Currently we don't support bloom filter for boolean columns,

Review Comment:
   ```suggestion
       /// Note: the Parquet spec does not support bloom filters for boolean 
columns,
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -147,7 +159,9 @@ class PARQUET_EXPORT ColumnProperties {
                    bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
                    bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
                    size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
-                   bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
+                   bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED,
+                   std::optional<BloomFilterOptions> bloom_filter_options =

Review Comment:
   This is not used any more.



##########
cpp/src/parquet/properties.h:
##########
@@ -757,6 +829,18 @@ class PARQUET_EXPORT WriterProperties {
     return false;
   }
 
+  bool bloom_filter_enabled() const {
+    // Note: we disallow enable bloom filter by default for now.

Review Comment:
   Perhaps say: `We do not encourage enabling bloom filters for all columns, 
so default_column_properties_.bloom_filter_enabled is always false and cannot 
be altered by the user. Thus we can safely skip checking it here.`



##########
cpp/src/parquet/file_writer.cc:
##########
@@ -360,6 +384,10 @@ class FileSerializer : public ParquetFileWriter::Contents {
       }
       row_group_writer_.reset();
 
+      // In Parquet standard, the Bloom filter data can be stored before the 
page indexes
+      // after all row groups or stored between row groups. We choose to store 
it before
+      // the page indexes after all row groups.

Review Comment:
   Putting all bloom filters together may provide a good chance to coalesce 
I/Os of different bloom filters, especially when only one column has it 
enabled, which is the common case.



##########
cpp/src/parquet/column_writer.h:
##########
@@ -143,7 +143,8 @@ class PARQUET_EXPORT ColumnWriter {
 
   static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
                                             std::unique_ptr<PageWriter>,
-                                            const WriterProperties* 
properties);
+                                            const WriterProperties* properties,
+                                            BloomFilter* bloom_filter = 
NULLPTR);

Review Comment:
   Should we avoid default parameters?



##########
cpp/src/parquet/arrow/arrow_reader_writer_test.cc:
##########
@@ -5584,5 +5592,104 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) {
                             /*null_counts=*/{0}}));
 }
 
+class ParquetBloomFilterRoundTripTest : public ::testing::Test,

Review Comment:
   Create an issue to keep track of it?



##########
cpp/src/parquet/properties.h:
##########
@@ -139,6 +139,18 @@ static constexpr Encoding::type DEFAULT_ENCODING = 
Encoding::UNKNOWN;
 static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
 static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = 
Compression::UNCOMPRESSED;
 static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = false;
+static constexpr int32_t DEFAULT_BLOOM_FILTER_NDV = 1024 * 1024;
+static constexpr double DEFAULT_BLOOM_FILTER_FPP = 0.05;
+
+struct BloomFilterOptions {
+  /// The number of distinct values to expect to be inserted into the bloom.
+  int32_t ndv = DEFAULT_BLOOM_FILTER_NDV;
+  /// The false positive probability expected from the bloom.
+  double fpp = DEFAULT_BLOOM_FILTER_FPP;
+};
+
+static constexpr std::optional<BloomFilterOptions> 
DEFAULT_IS_BLOOM_FILTER_OPTIONS =

Review Comment:
   Remove it?



##########
cpp/src/parquet/properties.h:
##########
@@ -186,6 +200,16 @@ class PARQUET_EXPORT ColumnProperties {
     page_index_enabled_ = page_index_enabled;
   }
 
+  void set_bloom_filter_options(std::optional<BloomFilterOptions> 
bloom_filter_options) {
+    if (bloom_filter_options) {
+      if (bloom_filter_options->fpp > 1.0 || bloom_filter_options->fpp < 0.0) {
+        throw ParquetException(
+            "Bloom Filter False positive probability must be between 0.0 and 
1.0");

Review Comment:
   nit: print the input fpp and make the inclusiveness of the boundaries clear.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] wgtmac commented on a diff in pull request #37400: GH-34785: [C++][Parquet] Parquet Bloom Filter Writer Implementation

Reply via email to