emkornfield commented on code in PR #40594:
URL: https://github.com/apache/arrow/pull/40594#discussion_r1588525218


##########
cpp/src/parquet/size_statistics.cc:
##########
@@ -0,0 +1,266 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet/size_statistics.h"
+
+#include <algorithm>
+
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/visit_data_inline.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/thrift_internal.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class SizeStatistics::SizeStatisticsImpl {
+ public:
+  SizeStatisticsImpl() = default;
+
+  SizeStatisticsImpl(const format::SizeStatistics* size_stats,
+                     const ColumnDescriptor* descr)
+      : rep_level_histogram_(size_stats->repetition_level_histogram),
+        def_level_histogram_(size_stats->definition_level_histogram) {
+    if (descr->physical_type() == Type::BYTE_ARRAY &&
+        size_stats->__isset.unencoded_byte_array_data_bytes) {
+      unencoded_byte_array_data_bytes_ = 
size_stats->unencoded_byte_array_data_bytes;
+    }
+  }
+
+  const std::vector<int64_t>& repetition_level_histogram() const {
+    return rep_level_histogram_;
+  }
+
+  const std::vector<int64_t>& definition_level_histogram() const {
+    return def_level_histogram_;
+  }
+
+  std::optional<int64_t> unencoded_byte_array_data_bytes() const {
+    return unencoded_byte_array_data_bytes_;
+  }
+
+  void Merge(const SizeStatistics& other) {
+    if (rep_level_histogram_.size() != 
other.repetition_level_histogram().size() ||
+        def_level_histogram_.size() != 
other.definition_level_histogram().size() ||
+        unencoded_byte_array_data_bytes_.has_value() !=
+            other.unencoded_byte_array_data_bytes().has_value()) {
+      throw ParquetException("Cannot merge incompatible SizeStatistics");
+    }
+
+    std::transform(rep_level_histogram_.begin(), rep_level_histogram_.end(),
+                   other.repetition_level_histogram().begin(),
+                   rep_level_histogram_.begin(), std::plus<>());
+
+    std::transform(def_level_histogram_.begin(), def_level_histogram_.end(),
+                   other.definition_level_histogram().begin(),
+                   def_level_histogram_.begin(), std::plus<>());
+    if (unencoded_byte_array_data_bytes_.has_value()) {
+      unencoded_byte_array_data_bytes_ = 
unencoded_byte_array_data_bytes_.value() +
+                                         
other.unencoded_byte_array_data_bytes().value();
+    }
+  }
+
+ private:
+  friend class SizeStatisticsBuilder;
+  std::vector<int64_t> rep_level_histogram_;
+  std::vector<int64_t> def_level_histogram_;
+  std::optional<int64_t> unencoded_byte_array_data_bytes_;
+};
+
+const std::vector<int64_t>& SizeStatistics::repetition_level_histogram() const 
{
+  return impl_->repetition_level_histogram();
+}
+
+const std::vector<int64_t>& SizeStatistics::definition_level_histogram() const 
{
+  return impl_->definition_level_histogram();
+}
+
+std::optional<int64_t> SizeStatistics::unencoded_byte_array_data_bytes() const 
{
+  return impl_->unencoded_byte_array_data_bytes();
+}
+
+void SizeStatistics::Merge(const SizeStatistics& other) { return 
impl_->Merge(other); }
+
+SizeStatistics::SizeStatistics(const void* size_statistics, const 
ColumnDescriptor* descr)
+    : impl_(std::make_unique<SizeStatisticsImpl>(
+          reinterpret_cast<const format::SizeStatistics*>(size_statistics), 
descr)) {}
+
+SizeStatistics::SizeStatistics() : 
impl_(std::make_unique<SizeStatisticsImpl>()) {}
+
+SizeStatistics::~SizeStatistics() = default;
+
+std::unique_ptr<SizeStatistics> SizeStatistics::Make(const void* 
size_statistics,
+                                                     const ColumnDescriptor* 
descr) {
+  return std::unique_ptr<SizeStatistics>(new SizeStatistics(size_statistics, 
descr));
+}
+
+class SizeStatisticsBuilder::SizeStatisticsBuilderImpl {
+ public:
+  SizeStatisticsBuilderImpl(const ColumnDescriptor* descr)
+      : rep_level_histogram_(descr->max_repetition_level() + 1, 0),
+        def_level_histogram_(descr->max_definition_level() + 1, 0) {
+    if (descr->physical_type() == Type::BYTE_ARRAY) {
+      unencoded_byte_array_data_bytes_ = 0;
+    }
+  }
+
+  void WriteRepetitionLevels(int64_t num_levels, const int16_t* rep_levels) {
+    for (int64_t i = 0; i < num_levels; ++i) {
+      ARROW_DCHECK_LT(rep_levels[i], 
static_cast<int16_t>(rep_level_histogram_.size()));
+      rep_level_histogram_[rep_levels[i]]++;
+    }
+  }
+
+  void WriteDefinitionLevels(int64_t num_levels, const int16_t* def_levels) {
+    for (int64_t i = 0; i < num_levels; ++i) {
+      ARROW_DCHECK_LT(def_levels[i], 
static_cast<int16_t>(def_level_histogram_.size()));
+      def_level_histogram_[def_levels[i]]++;
+    }
+  }
+
+  void WriteRepetitionLevel(int64_t num_levels, int16_t rep_level) {
+    ARROW_DCHECK_LT(rep_level, 
static_cast<int16_t>(rep_level_histogram_.size()));
+    rep_level_histogram_[rep_level] += num_levels;
+  }
+
+  void WriteDefinitionLevel(int64_t num_levels, int16_t def_level) {
+    ARROW_DCHECK_LT(def_level, 
static_cast<int16_t>(def_level_histogram_.size()));
+    def_level_histogram_[def_level] += num_levels;
+  }
+
+  void WriteValuesSpaced(const ByteArray* values, const uint8_t* valid_bits,
+                         int64_t valid_bits_offset, int64_t num_spaced_values) 
{
+    int64_t total_bytes = 0;
+    ::arrow::internal::VisitSetBitRunsVoid(valid_bits, valid_bits_offset,
+                                           num_spaced_values,
+                                           [&](int64_t pos, int64_t length) {
+                                             for (int64_t i = 0; i < length; 
i++) {
+                                               // Don't bother to check 
unlikely overflow.
+                                               total_bytes += values[i + 
pos].len;
+                                             }
+                                           });
+    IncrementUnencodedByteArrayDataBytes(total_bytes);
+  }
+
+  // Sums the `len` field of each of the `num_values` ByteArray entries
+  // starting at `values` and passes the total to
+  // IncrementUnencodedByteArrayDataBytes().
+  void WriteValues(const ByteArray* values, int64_t num_values) {
+    int64_t total_bytes = 0;
+    // Read the length from the lambda parameter `value`, not from the array
+    // pointer `values`: `values->len` is always the first element's length, so
+    // it would be added num_values times instead of summing every element.
+    std::for_each(values, values + num_values,
+                  [&](const ByteArray& value) { total_bytes += value.len; });
+    IncrementUnencodedByteArrayDataBytes(total_bytes);
+  }
+
+  void WriteValues(const ::arrow::Array& values) {
+    int64_t total_bytes = 0;
+    const auto valid_func = [&](ByteArray val) { total_bytes += val.len; };
+    const auto null_func = [&]() {};
+
+    if (::arrow::is_binary_like(values.type_id())) {
+      ::arrow::VisitArraySpanInline<::arrow::BinaryType>(
+          *values.data(), std::move(valid_func), std::move(null_func));
+    } else if (::arrow::is_large_binary_like(values.type_id())) {

Review Comment:
   I might have read too quickly above, but how does this account for 
`dictionary<string>` arrays?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to