mapleFU commented on code in PR #40594:
URL: https://github.com/apache/arrow/pull/40594#discussion_r1704941135


##########
cpp/src/parquet/size_statistics.cc:
##########
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliancec
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/size_statistics.h"
+
+#include <algorithm>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/visit_data_inline.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SizeStatistics::Impl {
+ public:
+  Impl() = default;
+
+  Impl(const format::SizeStatistics* size_stats, const ColumnDescriptor* descr)
+      : rep_level_histogram_(size_stats->repetition_level_histogram),
+        def_level_histogram_(size_stats->definition_level_histogram) {
+    if (descr->physical_type() == Type::BYTE_ARRAY &&
+        size_stats->__isset.unencoded_byte_array_data_bytes) {
+      unencoded_byte_array_data_bytes_ = 
size_stats->unencoded_byte_array_data_bytes;
+    }
+  }
+
+  const std::vector<int64_t>& repetition_level_histogram() const {
+    return rep_level_histogram_;
+  }
+
+  const std::vector<int64_t>& definition_level_histogram() const {
+    return def_level_histogram_;
+  }
+
+  std::optional<int64_t> unencoded_byte_array_data_bytes() const {
+    return unencoded_byte_array_data_bytes_;
+  }
+
+  void Merge(const SizeStatistics& other) {
+    if (rep_level_histogram_.size() != 
other.repetition_level_histogram().size() ||
+        def_level_histogram_.size() != 
other.definition_level_histogram().size() ||
+        unencoded_byte_array_data_bytes_.has_value() !=
+            other.unencoded_byte_array_data_bytes().has_value()) {
+      throw ParquetException("Cannot merge incompatible SizeStatistics");
+    }
+
+    std::transform(rep_level_histogram_.begin(), rep_level_histogram_.end(),
+                   other.repetition_level_histogram().begin(),
+                   rep_level_histogram_.begin(), std::plus<>());
+
+    std::transform(def_level_histogram_.begin(), def_level_histogram_.end(),
+                   other.definition_level_histogram().begin(),
+                   def_level_histogram_.begin(), std::plus<>());
+    if (unencoded_byte_array_data_bytes_.has_value()) {
+      unencoded_byte_array_data_bytes_ = 
unencoded_byte_array_data_bytes_.value() +
+                                         
other.unencoded_byte_array_data_bytes().value();
+    }
+  }
+
+ private:
+  friend class SizeStatisticsBuilder;
+  std::vector<int64_t> rep_level_histogram_;
+  std::vector<int64_t> def_level_histogram_;
+  std::optional<int64_t> unencoded_byte_array_data_bytes_;
+};
+
+const std::vector<int64_t>& SizeStatistics::repetition_level_histogram() const 
{
+  return impl_->repetition_level_histogram();
+}
+
+const std::vector<int64_t>& SizeStatistics::definition_level_histogram() const 
{
+  return impl_->definition_level_histogram();
+}
+
+std::optional<int64_t> SizeStatistics::unencoded_byte_array_data_bytes() const 
{
+  return impl_->unencoded_byte_array_data_bytes();
+}
+
+void SizeStatistics::Merge(const SizeStatistics& other) { return 
impl_->Merge(other); }
+
+SizeStatistics::SizeStatistics(const void* size_statistics, const 
ColumnDescriptor* descr)
+    : impl_(std::make_unique<Impl>(
+          reinterpret_cast<const format::SizeStatistics*>(size_statistics), 
descr)) {}
+
+SizeStatistics::SizeStatistics() : impl_(std::make_unique<Impl>()) {}
+
+SizeStatistics::~SizeStatistics() = default;
+
+std::unique_ptr<SizeStatistics> SizeStatistics::Make(const void* 
size_statistics,
+                                                     const ColumnDescriptor* 
descr) {
+  return std::unique_ptr<SizeStatistics>(new SizeStatistics(size_statistics, 
descr));
+}
+
+class SizeStatisticsBuilder::Impl {
+ public:
+  explicit Impl(const ColumnDescriptor* descr)
+      : rep_level_histogram_(descr->max_repetition_level() + 1, 0),
+        def_level_histogram_(descr->max_definition_level() + 1, 0) {
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      unencoded_byte_array_data_bytes_ = 0;
+    }
+  }
+
+  void WriteRepetitionLevels(::arrow::util::span<const int16_t> rep_levels) {
+    for (int16_t rep_level : rep_levels) {
+      ARROW_DCHECK_LT(rep_level, 
static_cast<int16_t>(rep_level_histogram_.size()));
+      rep_level_histogram_[rep_level]++;
+    }
+  }
+
+  void WriteDefinitionLevels(::arrow::util::span<const int16_t> def_levels) {
+    for (int16_t def_level : def_levels) {
+      ARROW_DCHECK_LT(def_level, 
static_cast<int16_t>(def_level_histogram_.size()));
+      def_level_histogram_[def_level]++;

Review Comment:
   ```suggestion
         ++def_level_histogram_[def_level];
   ```
   
   nit



##########
cpp/src/parquet/column_page.h:
##########
@@ -69,27 +70,32 @@ class DataPage : public Page {
   /// Currently it is only present from data pages created by ColumnWriter in 
order
   /// to collect page index.
   std::optional<int64_t> first_row_index() const { return first_row_index_; }
+  const std::shared_ptr<SizeStatistics>& size_statistics() const {
+    return size_statistics_;
+  }
 
   virtual ~DataPage() = default;
 
  protected:
   DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t 
num_values,
            Encoding::type encoding, int64_t uncompressed_size,
-           EncodedStatistics statistics = EncodedStatistics(),
-           std::optional<int64_t> first_row_index = std::nullopt)
+           EncodedStatistics statistics, std::optional<int64_t> 
first_row_index,
+           std::shared_ptr<SizeStatistics> size_statistics)
       : Page(buffer, type),
         num_values_(num_values),
         encoding_(encoding),
         uncompressed_size_(uncompressed_size),
         statistics_(std::move(statistics)),
-        first_row_index_(std::move(first_row_index)) {}
+        first_row_index_(std::move(first_row_index)),
+        size_statistics_(std::move(size_statistics)) {}
 
   int32_t num_values_;
   Encoding::type encoding_;
   int64_t uncompressed_size_;
   EncodedStatistics statistics_;
   /// Row ordinal within the row group to the first row in the data page.
   std::optional<int64_t> first_row_index_;
+  std::shared_ptr<SizeStatistics> size_statistics_;

Review Comment:
   Can we add comment for this could/could not be nullable?



##########
cpp/src/parquet/column_writer.cc:
##########
@@ -789,11 +793,13 @@ class ColumnWriterImpl {
   // Serializes Dictionary Page if enabled
   virtual void WriteDictionaryPage() = 0;
 
+  using StatisticsPair = std::pair<EncodedStatistics, 
std::shared_ptr<SizeStatistics>>;

Review Comment:
   Same issue here



##########
cpp/src/parquet/size_statistics.cc:
##########
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliancec
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/size_statistics.h"
+
+#include <algorithm>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/visit_data_inline.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SizeStatistics::Impl {
+ public:
+  Impl() = default;
+
+  Impl(const format::SizeStatistics* size_stats, const ColumnDescriptor* descr)
+      : rep_level_histogram_(size_stats->repetition_level_histogram),
+        def_level_histogram_(size_stats->definition_level_histogram) {
+    if (descr->physical_type() == Type::BYTE_ARRAY &&
+        size_stats->__isset.unencoded_byte_array_data_bytes) {
+      unencoded_byte_array_data_bytes_ = 
size_stats->unencoded_byte_array_data_bytes;
+    }
+  }
+
+  const std::vector<int64_t>& repetition_level_histogram() const {
+    return rep_level_histogram_;
+  }
+
+  const std::vector<int64_t>& definition_level_histogram() const {
+    return def_level_histogram_;
+  }
+
+  std::optional<int64_t> unencoded_byte_array_data_bytes() const {
+    return unencoded_byte_array_data_bytes_;
+  }
+
+  void Merge(const SizeStatistics& other) {
+    if (rep_level_histogram_.size() != 
other.repetition_level_histogram().size() ||
+        def_level_histogram_.size() != 
other.definition_level_histogram().size() ||
+        unencoded_byte_array_data_bytes_.has_value() !=
+            other.unencoded_byte_array_data_bytes().has_value()) {
+      throw ParquetException("Cannot merge incompatible SizeStatistics");
+    }
+
+    std::transform(rep_level_histogram_.begin(), rep_level_histogram_.end(),
+                   other.repetition_level_histogram().begin(),
+                   rep_level_histogram_.begin(), std::plus<>());
+
+    std::transform(def_level_histogram_.begin(), def_level_histogram_.end(),
+                   other.definition_level_histogram().begin(),
+                   def_level_histogram_.begin(), std::plus<>());
+    if (unencoded_byte_array_data_bytes_.has_value()) {
+      unencoded_byte_array_data_bytes_ = 
unencoded_byte_array_data_bytes_.value() +
+                                         
other.unencoded_byte_array_data_bytes().value();
+    }
+  }
+
+ private:
+  friend class SizeStatisticsBuilder;
+  std::vector<int64_t> rep_level_histogram_;
+  std::vector<int64_t> def_level_histogram_;
+  std::optional<int64_t> unencoded_byte_array_data_bytes_;
+};
+
+const std::vector<int64_t>& SizeStatistics::repetition_level_histogram() const 
{
+  return impl_->repetition_level_histogram();
+}
+
+const std::vector<int64_t>& SizeStatistics::definition_level_histogram() const 
{
+  return impl_->definition_level_histogram();
+}
+
+std::optional<int64_t> SizeStatistics::unencoded_byte_array_data_bytes() const 
{
+  return impl_->unencoded_byte_array_data_bytes();
+}
+
+void SizeStatistics::Merge(const SizeStatistics& other) { return 
impl_->Merge(other); }
+
+SizeStatistics::SizeStatistics(const void* size_statistics, const 
ColumnDescriptor* descr)
+    : impl_(std::make_unique<Impl>(
+          reinterpret_cast<const format::SizeStatistics*>(size_statistics), 
descr)) {}
+
+SizeStatistics::SizeStatistics() : impl_(std::make_unique<Impl>()) {}
+
+SizeStatistics::~SizeStatistics() = default;
+
+std::unique_ptr<SizeStatistics> SizeStatistics::Make(const void* 
size_statistics,
+                                                     const ColumnDescriptor* 
descr) {
+  return std::unique_ptr<SizeStatistics>(new SizeStatistics(size_statistics, 
descr));
+}
+
+class SizeStatisticsBuilder::Impl {
+ public:
+  explicit Impl(const ColumnDescriptor* descr)
+      : rep_level_histogram_(descr->max_repetition_level() + 1, 0),
+        def_level_histogram_(descr->max_definition_level() + 1, 0) {
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      unencoded_byte_array_data_bytes_ = 0;
+    }
+  }
+
+  void WriteRepetitionLevels(::arrow::util::span<const int16_t> rep_levels) {
+    for (int16_t rep_level : rep_levels) {
+      ARROW_DCHECK_LT(rep_level, 
static_cast<int16_t>(rep_level_histogram_.size()));
+      rep_level_histogram_[rep_level]++;

Review Comment:
   ```suggestion
         ++rep_level_histogram_[rep_level];
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to