kou commented on code in PR #44252:
URL: https://github.com/apache/arrow/pull/44252#discussion_r1828427406
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,220 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::ViewOrCopyTo(
return Make(schema_, num_rows(), std::move(copied_columns));
}
+namespace {
+struct EnumeratedStatistics {
+ int nth_statistics = 0;
+ bool start_new_column = false;
+ std::optional<int32_t> nth_column = std::nullopt;
+ const char* key = nullptr;
+ std::shared_ptr<DataType> type = nullptr;
+ ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+ std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics
on_statistics) {
+ EnumeratedStatistics statistics;
+ statistics.nth_statistics = 0;
+ statistics.start_new_column = true;
+ statistics.nth_column = std::nullopt;
+
+ statistics.key = ARROW_STATISTICS_KEY_ROW_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = record_batch.num_rows();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+
+ const auto num_fields = record_batch.schema()->num_fields();
+ for (int nth_column = 0; nth_column < num_fields; ++nth_column) {
+ auto column_statistics = record_batch.column(nth_column)->statistics();
+ if (!column_statistics) {
+ continue;
+ }
+
+ statistics.start_new_column = true;
+ statistics.nth_column = nth_column;
+ if (column_statistics->null_count.has_value()) {
+ statistics.nth_statistics++;
+ statistics.key = ARROW_STATISTICS_KEY_NULL_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = column_statistics->null_count.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->distinct_count.has_value()) {
+ statistics.nth_statistics++;
+ statistics.key = ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT;
+ statistics.type = int64();
+ statistics.value = column_statistics->distinct_count.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->min.has_value()) {
+ statistics.nth_statistics++;
+ if (column_statistics->is_min_exact) {
+ statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_EXACT;
+ } else {
+ statistics.key = ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE;
+ }
+ statistics.type = column_statistics->MinArrowType();
+ statistics.value = column_statistics->min.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+
+ if (column_statistics->max.has_value()) {
+ statistics.nth_statistics++;
+ if (column_statistics->is_max_exact) {
+ statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_EXACT;
+ } else {
+ statistics.key = ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE;
+ }
+ statistics.type = column_statistics->MaxArrowType();
+ statistics.value = column_statistics->max.value();
+ RETURN_NOT_OK(on_statistics(statistics));
+ statistics.start_new_column = false;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Result<std::shared_ptr<Array>> RecordBatch::MakeStatisticsArray(
+ MemoryPool* memory_pool) const {
+ // Statistics schema:
+ // struct<
+ // column: int32,
+ // statistics: map<
+ // key: dictionary<
+ // indices: int32,
+ // dictionary: utf8,
+ // >,
+ // items: dense_union<...all needed types...>,
+ // >
+ // >
+
+ // Statistics schema doesn't define static dense union type for
+ // values. Each statistics schema has a dense union type that has
+ // needed value types. The following block collects these types.
Review Comment:
Right.
If the same type appears more than once, only the first one is used.
##########
cpp/src/arrow/record_batch.cc:
##########
@@ -465,6 +470,220 @@ Result<std::shared_ptr<RecordBatch>>
RecordBatch::ViewOrCopyTo(
return Make(schema_, num_rows(), std::move(copied_columns));
}
+namespace {
+struct EnumeratedStatistics {
+ int nth_statistics = 0;
+ bool start_new_column = false;
+ std::optional<int32_t> nth_column = std::nullopt;
+ const char* key = nullptr;
+ std::shared_ptr<DataType> type = nullptr;
+ ArrayStatistics::ValueType value = false;
+};
+using OnStatistics =
+ std::function<Status(const EnumeratedStatistics& enumerated_statistics)>;
+Status EnumerateStatistics(const RecordBatch& record_batch, OnStatistics
on_statistics) {
Review Comment:
Right.
I think that it's one of the complexities.
So I sent https://lists.apache.org/thread/0c9jftkspvj7yw1lpo73s3vtp6vfjqv8
to the mailing list. But nobody agreed with it. So this complexity is
presumably acceptable...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]