mapleFU commented on code in PR #44252:
URL: https://github.com/apache/arrow/pull/44252#discussion_r1821976919
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::ViewOrCopyTo(
return Make(schema_, num_rows(), std::move(copied_columns));
}
+namespace {
+struct EnumeratedStatistics {
+ int nth_statistics = 0;
+ bool start_new_column = false;
+ std::optional<int32_t> nth_column = std::nullopt;
+ const char* key = nullptr;
+ std::shared_ptr<DataType> type = nullptr;
+ ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+ std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics
on_statistics) {
+ EnumeratedStatistics statistics;
+ statistics.nth_statistics = 0;
+ statistics.start_new_column = true;
+ statistics.nth_column = std::nullopt;
+ statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = record_batch.num_rows();
+ RETURN_NOT_OK(on_statistics(statistics));
Review Comment:
Should we mark `start_new_column=false` here?
##########
cpp/src/arrow/c/abi.h:
##########
@@ -80,6 +80,24 @@ struct ArrowArray {
void* private_data;
};
+# define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT
"ARROW:average_byte_width:exact"
Review Comment:
I'm not sure whether a `constexpr std::string_view` would be better than this macro.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,169 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::ViewOrCopyTo(
return Make(schema_, num_rows(), std::move(copied_columns));
}
+namespace {
+struct EnumeratedStatistics {
+ int nth_statistics = 0;
+ bool start_new_column = false;
+ std::optional<int32_t> nth_column = std::nullopt;
+ const char* key = nullptr;
+ std::shared_ptr<DataType> type = nullptr;
+ ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+ std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics
on_statistics) {
+ EnumeratedStatistics statistics;
+ statistics.nth_statistics = 0;
+ statistics.start_new_column = true;
+ statistics.nth_column = std::nullopt;
+ statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
Review Comment:
So the row count is also handled as a statistic 🤔?
##########
cpp/src/arrow/array/statistics.h:
##########
@@ -34,6 +35,22 @@ namespace arrow {
struct ARROW_EXPORT ArrayStatistics {
using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
+ static const std::shared_ptr<DataType>& ValueToArrowType(
+ const std::optional<ValueType>& value) {
+ if (!value.has_value()) {
+ return null();
+ }
+
+ struct Visitor {
+ const std::shared_ptr<DataType>& operator()(const bool&) { return
boolean(); }
+ const std::shared_ptr<DataType>& operator()(const int64_t&) { return
int64(); }
+ const std::shared_ptr<DataType>& operator()(const uint64_t&) { return
uint64(); }
+ const std::shared_ptr<DataType>& operator()(const double&) { return
float64(); }
+ const std::shared_ptr<DataType>& operator()(const std::string&) { return
utf8(); }
Review Comment:
I may have forgotten the details, but we don't distinguish between "bytes" and "utf8" in stats?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]