[GitHub] [arrow] mapleFU commented on a diff in pull request #37400: GH-34785: [C++][Parquet] Parquet Bloom Filter Implement

via GitHub Sat, 26 Aug 2023 08:20:48 -0700


mapleFU commented on code in PR #37400:
URL: https://github.com/apache/arrow/pull/37400#discussion_r1306470638



##########
cpp/src/parquet/bloom_filter.h:
##########
@@ -167,6 +167,12 @@ class PARQUET_EXPORT BloomFilter {
 
   virtual ~BloomFilter() = default;
 
+  // Variant of const pointer argument to facilitate template

Review Comment:
   @emkornfield I think pointer is ok here, would you think 
   
   ```c++
     virtual void Hash(const FLBA& value, uint32_t type_len) const;
     virtual void Hash(const ByteArray& value, uint32_t type_len) const;
     virtual void Hash(const Int96& value, uint32_t type_len) const;
   ```
   
   The code above is better?
   



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2319,12 +2343,113 @@ Status 
TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
   return Status::OK();
 }
 
+template <typename DType>
+void TypedColumnWriterImpl<DType>::UpdateBloomFilter(const T* values,
+                                                     int64_t num_values) {
+  if (bloom_filter_) {
+    // TODO(mwish): Would it allocate too much memory? Would an 
std::array<uint64_t, 64>
+    //  better here?
+    std::vector<uint64_t> hashes(num_values);

Review Comment:
   Would I use an `std::array` to avoid allocating temporary stack memory?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -1708,7 +1728,7 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
                       AddIfNotNull(rep_levels, offset));
     std::shared_ptr<Array> writeable_indices =
         indices->Slice(value_offset, batch_num_spaced_values);
-    if (page_statistics_) {
+    if (page_statistics_ || bloom_filter_) {

Review Comment:
   The part of code is for dictionary. Previously, `page_statistics` will call 
`update_stats` to get dictionary items used in dictionary, and use them for 
"min-max".
   
   Here I reuse the code the part, and making it update the bloom-filter using 
`referenced_dictionary`.



##########
cpp/src/parquet/bloom_filter_parquet_test.cc:
##########
@@ -69,4 +71,36 @@ TEST(BloomFilterReader, FileNotHaveBloomFilter) {
   ASSERT_EQ(nullptr, bloom_filter);
 }
 
+TEST(BloomFilterBuilderTest, Basic) {

Review Comment:
   Actually I'm not familiar with test writing, it would be great if someone 
tell me how should I test these...



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -2319,12 +2343,113 @@ Status 
TypedColumnWriterImpl<FLBAType>::WriteArrowDense(
   return Status::OK();
 }
 
+template <typename DType>
+void TypedColumnWriterImpl<DType>::UpdateBloomFilter(const T* values,
+                                                     int64_t num_values) {
+  if (bloom_filter_) {
+    // TODO(mwish): Would it allocate too much memory? Would an 
std::array<uint64_t, 64>
+    //  better here?
+    std::vector<uint64_t> hashes(num_values);
+    bloom_filter_->Hashes(values, static_cast<int>(num_values), hashes.data());
+    bloom_filter_->InsertHashes(hashes.data(), static_cast<int>(num_values));
+  }
+}
+
+template <>
+void TypedColumnWriterImpl<FLBAType>::UpdateBloomFilter(const FLBA* values,
+                                                        int64_t num_values) {
+  if (bloom_filter_) {
+    for (int64_t i = 0; i < num_values; ++i) {
+      bloom_filter_->InsertHash(bloom_filter_->Hash(values + i, 
descr_->type_length()));
+    }
+  }
+}
+
+template <>
+void TypedColumnWriterImpl<BooleanType>::UpdateBloomFilter(const bool*, 
int64_t) {
+  DCHECK(bloom_filter_ == nullptr);
+}
+
+template <typename DType>
+void TypedColumnWriterImpl<DType>::UpdateBloomFilterSpaced(const T* values,
+                                                           int64_t num_values,
+                                                           const uint8_t* 
valid_bits,
+                                                           int64_t 
valid_bits_offset) {
+  if (bloom_filter_) {
+    ::arrow::internal::VisitSetBitRunsVoid(
+        valid_bits, valid_bits_offset, num_values, [&](int64_t position, 
int64_t length) {
+          for (int64_t i = 0; i < length; i++) {
+            bloom_filter_->InsertHash(bloom_filter_->Hash(values + i + 
position));
+          }
+        });
+  }
+}
+
+template <>
+void TypedColumnWriterImpl<BooleanType>::UpdateBloomFilterSpaced(const bool*, 
int64_t,
+                                                                 const 
uint8_t*,
+                                                                 int64_t) {
+  DCHECK(bloom_filter_ == nullptr);
+}
+
+template <>
+void TypedColumnWriterImpl<FLBAType>::UpdateBloomFilterSpaced(const FLBA* 
values,
+                                                              int64_t 
num_values,
+                                                              const uint8_t* 
valid_bits,
+                                                              int64_t 
valid_bits_offset) {
+  if (bloom_filter_) {
+    ::arrow::internal::VisitSetBitRunsVoid(
+        valid_bits, valid_bits_offset, num_values, [&](int64_t position, 
int64_t length) {
+          for (int64_t i = 0; i < length; i++) {
+            bloom_filter_->InsertHash(
+                bloom_filter_->Hash(values + i + position, 
descr_->type_length()));
+          }
+        });
+  }
+}
+
+template <typename ArrayType>
+void UpdateBinaryBloomFilter(BloomFilter* bloom_filter, const ArrayType& 
array) {
+  PARQUET_THROW_NOT_OK(::arrow::VisitArraySpanInline<typename 
ArrayType::TypeClass>(
+      *array.data(),
+      [&](const std::string_view& view) {
+        const ByteArray value{view};
+        bloom_filter->InsertHash(bloom_filter->Hash(&value));
+        return Status::OK();
+      },
+      []() { return Status::OK(); }));
+}
+
+template <>
+void TypedColumnWriterImpl<ByteArrayType>::UpdateBloomFilterArray(

Review Comment:
   I've check the code here.
   1. dictionary is handled in `WriteDictionary`, so `UpdateBloomFilterArray` 
will only write non-dictionary items.
   2. `WriteArrowDense` would use `WriteSpace` or other. Only `ByteArray` type 
would use this function.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] mapleFU commented on a diff in pull request #37400: GH-34785: [C++][Parquet] Parquet Bloom Filter Implement

Reply via email to