mapleFU commented on code in PR #44252:
URL: https://github.com/apache/arrow/pull/44252#discussion_r1821976919


##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>> 
RecordBatch::ViewOrCopyTo(
   return Make(schema_, num_rows(), std::move(copied_columns));
 }
 
+namespace {
+struct EnumeratedStatistics {
+  int nth_statistics = 0;
+  bool start_new_column = false;
+  std::optional<int32_t> nth_column = std::nullopt;
+  const char* key = nullptr;
+  std::shared_ptr<DataType> type = nullptr;
+  ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+    std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics 
on_statistics) {
+  EnumeratedStatistics statistics;
+  statistics.nth_statistics = 0;
+  statistics.start_new_column = true;
+  statistics.nth_column = std::nullopt;
+  statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+  statistics.type = int64();
+  statistics.value = record_batch.num_rows();
+  RETURN_NOT_OK(on_statistics(statistics));

Review Comment:
   Should we mark `start_new_column=false`?



##########
cpp/src/arrow/c/abi.h:
##########
@@ -80,6 +80,24 @@ struct ArrowArray {
   void* private_data;
 };
 
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT 
"ARROW:average_byte_width:exact"

Review Comment:
   I don't know whether a `constexpr std::string_view` would be better or whether this is better.



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>> 
RecordBatch::ViewOrCopyTo(
   return Make(schema_, num_rows(), std::move(copied_columns));
 }
 
+namespace {
+struct EnumeratedStatistics {
+  int nth_statistics = 0;
+  bool start_new_column = false;
+  std::optional<int32_t> nth_column = std::nullopt;
+  const char* key = nullptr;
+  std::shared_ptr<DataType> type = nullptr;
+  ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+    std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics 
on_statistics) {
+  EnumeratedStatistics statistics;
+  statistics.nth_statistics = 0;
+  statistics.start_new_column = true;
+  statistics.nth_column = std::nullopt;
+  statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;

Review Comment:
   So RowCount is also handled as a statistic 🤔?



##########
cpp/src/arrow/array/statistics.h:
##########
@@ -34,6 +35,22 @@ namespace arrow {
 struct ARROW_EXPORT ArrayStatistics {
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
 
+  static const std::shared_ptr<DataType>& ValueToArrowType(
+      const std::optional<ValueType>& value) {
+    if (!value.has_value()) {
+      return null();
+    }
+
+    struct Visitor {
+      const std::shared_ptr<DataType>& operator()(const bool&) { return 
boolean(); }
+      const std::shared_ptr<DataType>& operator()(const int64_t&) { return 
int64(); }
+      const std::shared_ptr<DataType>& operator()(const uint64_t&) { return 
uint64(); }
+      const std::shared_ptr<DataType>& operator()(const double&) { return 
float64(); }
+      const std::shared_ptr<DataType>& operator()(const std::string&) { return 
utf8(); }

Review Comment:
   I may have forgotten a bit, but do we not distinguish between "bytes" and "utf8" in stats?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to