pitrou commented on code in PR #50008:
URL: https://github.com/apache/arrow/pull/50008#discussion_r3381100483


##########
cpp/src/parquet/bloom_filter_reader_writer_test.cc:
##########
@@ -150,6 +150,100 @@ TEST(BloomFilterBuilder, BasicRoundTrip) {
   }
 }
 
+namespace {
+
+struct BloomFilterBuilderFoldingTestCase {
+  int64_t ndv;
+  bool fold;
+  int32_t inserted_count;
+  int64_t expected_bitset_ndv;
+};
+
+class BloomFilterBuilderFoldingTest
+    : public ::testing::TestWithParam<BloomFilterBuilderFoldingTestCase> {};
+
+}  // namespace
+
+TEST_P(BloomFilterBuilderFoldingTest, RespectsOption) {
+  const auto& test_case = GetParam();
+
+  SchemaDescriptor schema;
+  schema::NodePtr root =
+      schema::GroupNode::Make("schema", Repetition::REPEATED, 
{schema::ByteArray("c1")});
+  schema.Init(root);
+
+  constexpr double kFpp = 0.05;
+  BloomFilterOptions bloom_filter_options{
+      .ndv = test_case.ndv, .fpp = kFpp, .fold = test_case.fold};
+  const auto initial_bitset_size = BlockSplitBloomFilter::OptimalNumOfBytes(
+      bloom_filter_options.ndv.value(), bloom_filter_options.fpp);
+  WriterProperties::Builder properties_builder;
+  properties_builder.enable_bloom_filter("c1", bloom_filter_options);
+  auto writer_properties = properties_builder.build();
+  auto bloom_filter_builder = BloomFilterBuilder::Make(&schema, 
writer_properties.get());
+
+  bloom_filter_builder->AppendRowGroup();
+  auto bloom_filter = 
bloom_filter_builder->CreateBloomFilter(/*column_ordinal=*/0);
+  ASSERT_NE(bloom_filter, nullptr);
+  ASSERT_EQ(initial_bitset_size, bloom_filter->GetBitsetSize());
+
+  std::vector<uint64_t> hashes;
+  hashes.reserve(test_case.inserted_count);
+  for (int32_t i = 0; i < test_case.inserted_count; ++i) {
+    const auto hash = bloom_filter->Hash(i);
+    hashes.push_back(hash);
+    bloom_filter->InsertHash(hash);
+  }
+
+  auto sink = CreateOutputStream();
+  auto locations = bloom_filter_builder->WriteTo(sink.get());
+  ASSERT_EQ(locations.size(), 1);
+  ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
+
+  const auto& location = locations.front().second;
+  ReaderProperties reader_properties;
+  ::arrow::io::BufferReader reader(
+      ::arrow::SliceBuffer(buffer, location.offset, location.length));
+  auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, 
&reader);
+
+  
EXPECT_EQ(BlockSplitBloomFilter::OptimalNumOfBytes(test_case.expected_bitset_ndv,
 kFpp),
+            filter.GetBitsetSize());
+  for (uint64_t hash : hashes) {
+    EXPECT_TRUE(filter.FindHash(hash));
+  }
+
+  int32_t false_positives = 0;
+  constexpr int32_t kNonInsertedCount = 10'000;
+  for (int32_t i = test_case.inserted_count;
+       i < test_case.inserted_count + kNonInsertedCount; ++i) {
+    false_positives += filter.FindHash(filter.Hash(i));
+  }
+  EXPECT_LT(static_cast<double>(false_positives) / kNonInsertedCount, kFpp);

Review Comment:
   Perhaps we can even check that we are close enough to this FPP, otherwise 
this may mean that we didn't fold enough? Something like:
   
   ```suggestion
     const auto sample_fpp = static_cast<double>(false_positives) / 
kNonInsertedCount;
     EXPECT_LT(sample_fpp, kFpp);
     // If the actual fpp, as computed on this sample, is significantly below 
kFpp / 2,
     // then we could have folded the bloom filter at least once more.
     EXPECT_GT(sample_fpp, kFpp / 2.1);
   ```
   



##########
cpp/src/parquet/bloom_filter.cc:
##########
@@ -345,9 +348,89 @@ void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* 
sink) const {
   PARQUET_THROW_NOT_OK(sink->Write(data_->data(), num_bytes_));
 }
 
+void BlockSplitBloomFilter::FoldToTargetFpp(double target_fpp) {
+  const auto num_bits = static_cast<int64_t>(num_bytes_) * 8;
+  const auto total_set_bits =
+      ::arrow::internal::CountSetBits(data_->data(), /*bit_offset=*/0, 
num_bits);
+  if (total_set_bits == 0) {
+    num_bytes_ = kMinimumBloomFilterBytes;
+    return;
+  }
+
+  const double avg_fill = static_cast<double>(total_set_bits) / num_bits;
+  const uint32_t num_folds = NumFoldsForTargetFpp(target_fpp, avg_fill);
+  if (num_folds > 0) {
+    Fold(num_folds);
+  }
+}
+
+uint32_t BlockSplitBloomFilter::NumFoldsForTargetFpp(double target_fpp,
+                                                     double avg_fill) const {
+  const uint32_t num_blocks = NumBlocks();
+  if (num_blocks < 2) {
+    return 0;
+  }
+  DCHECK_EQ(num_blocks & (num_blocks - 1), 0);

Review Comment:
   ```suggestion
     // Number of blocks is a power of two
     DCHECK_EQ(num_blocks & (num_blocks - 1), 0);
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to