kou commented on code in PR #44252:
URL: https://github.com/apache/arrow/pull/44252#discussion_r1822002635


##########
cpp/src/arrow/array/statistics.h:
##########
@@ -34,6 +35,22 @@ namespace arrow {
 struct ARROW_EXPORT ArrayStatistics {
   using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
 
+  static const std::shared_ptr<DataType>& ValueToArrowType(
+      const std::optional<ValueType>& value) {
+    if (!value.has_value()) {
+      return null();
+    }
+
+    struct Visitor {
+      const std::shared_ptr<DataType>& operator()(const bool&) { return 
boolean(); }
+      const std::shared_ptr<DataType>& operator()(const int64_t&) { return 
int64(); }
+      const std::shared_ptr<DataType>& operator()(const uint64_t&) { return 
uint64(); }
+      const std::shared_ptr<DataType>& operator()(const double&) { return 
float64(); }
+      const std::shared_ptr<DataType>& operator()(const std::string&) { return 
utf8(); }

Review Comment:
   Ah, we didn't discuss it...
   Let's discuss it in #44579.
   
   We can assume "utf8" here for now.



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>> 
RecordBatch::ViewOrCopyTo(
   return Make(schema_, num_rows(), std::move(copied_columns));
 }
 
+namespace {
+struct EnumeratedStatistics {
+  int nth_statistics = 0;
+  bool start_new_column = false;
+  std::optional<int32_t> nth_column = std::nullopt;
+  const char* key = nullptr;
+  std::shared_ptr<DataType> type = nullptr;
+  ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+    std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics 
on_statistics) {
+  EnumeratedStatistics statistics;
+  statistics.nth_statistics = 0;
+  statistics.start_new_column = true;
+  statistics.nth_column = std::nullopt;
+  statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+  statistics.type = int64();
+  statistics.value = record_batch.num_rows();
+  RETURN_NOT_OK(on_statistics(statistics));

Review Comment:
   Ah, that's safer for when we add new record batch/table-level statistics. 
I'll do it.
   



##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>> 
RecordBatch::ViewOrCopyTo(
   return Make(schema_, num_rows(), std::move(copied_columns));
 }
 
+namespace {
+struct EnumeratedStatistics {
+  int nth_statistics = 0;
+  bool start_new_column = false;
+  std::optional<int32_t> nth_column = std::nullopt;
+  const char* key = nullptr;
+  std::shared_ptr<DataType> type = nullptr;
+  ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+    std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics 
on_statistics) {
+  EnumeratedStatistics statistics;
+  statistics.nth_statistics = 0;
+  statistics.start_new_column = true;
+  statistics.nth_column = std::nullopt;
+  statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;

Review Comment:
   The statistics array will be passed to the consumer before the consumer 
receives a record batch.
   So this may be useful for the consumer.
   
   But DuckDB doesn't have a row count in its `BaseStatistics`: 
https://github.com/duckdb/duckdb/blob/670cd341249e266de384e0341f200f4864b41b27/src/include/duckdb/storage/statistics/base_statistics.hpp#L38-L146
   So this may not be useful after all...



##########
cpp/src/arrow/c/abi.h:
##########
@@ -80,6 +80,24 @@ struct ArrowArray {
   void* private_data;
 };
 
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT 
"ARROW:average_byte_width:exact"

Review Comment:
   We can't use `constexpr` because this header may be used by C programs.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to