This is an automated email from the ASF dual-hosted git repository.
leaves12138 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/paimon-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 6a658f6 feat: add simple stats core utilities (#68)
6a658f6 is described below
commit 6a658f6c46318d53a40ae5e27fe939858bb22640
Author: Yonghao Fang <[email protected]>
AuthorDate: Wed Jun 10 10:40:21 2026 +0800
feat: add simple stats core utilities (#68)
---
src/paimon/core/stats/simple_stats.cpp | 116 +++++
src/paimon/core/stats/simple_stats.h | 91 ++++
src/paimon/core/stats/simple_stats_collector.cpp | 213 +++++++++
src/paimon/core/stats/simple_stats_collector.h | 60 +++
.../core/stats/simple_stats_collector_test.cpp | 115 +++++
src/paimon/core/stats/simple_stats_converter.cpp | 299 +++++++++++++
src/paimon/core/stats/simple_stats_converter.h | 40 ++
src/paimon/core/stats/simple_stats_evolution.cpp | 318 ++++++++++++++
src/paimon/core/stats/simple_stats_evolution.h | 93 ++++
.../core/stats/simple_stats_evolution_test.cpp | 479 +++++++++++++++++++++
src/paimon/core/stats/simple_stats_evolutions.h | 55 +++
src/paimon/core/stats/simple_stats_test.cpp | 50 +++
12 files changed, 1929 insertions(+)
diff --git a/src/paimon/core/stats/simple_stats.cpp
b/src/paimon/core/stats/simple_stats.cpp
new file mode 100644
index 0000000..b8a5ee3
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats.cpp
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats.h"
+
+#include <utility>
+#include <vector>
+
+#include "arrow/api.h"
+#include "paimon/common/data/binary_row_writer.h"
+#include "paimon/common/data/binary_section.h"
+#include "paimon/common/data/internal_array.h"
+#include "paimon/common/data/internal_row.h"
+#include "paimon/common/utils/murmurhash_utils.h"
+#include "paimon/common/utils/serialization_utils.h"
+#include "paimon/macros.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/status.h"
+
+namespace paimon {
+class Bytes;
+
+/// Returns the shared stats instance for zero columns: empty min row, empty max row and an empty
+/// null-count array allocated from the default pool. The instance is a function-local static, so
+/// it is built on the first call and reused afterwards.
+const SimpleStats& SimpleStats::EmptyStats() {
+    static const SimpleStats kEmptyStats = [] {
+        return SimpleStats(
+            BinaryRow::EmptyRow(), BinaryRow::EmptyRow(),
+            BinaryArray::FromLongArray(std::vector<int64_t>(), GetDefaultPool().get()));
+    }();
+    return kEmptyStats;
+}
+
+/// Packs the stats into a BinaryRow(3): position 0 receives the serialized min row, position 1 the
+/// serialized max row and position 2 the null-count array. Buffers come from the default pool.
+BinaryRow SimpleStats::ToRow() const {
+    const std::shared_ptr<MemoryPool> default_pool = GetDefaultPool();
+    MemoryPool* raw_pool = default_pool.get();
+
+    BinaryRow packed(3);
+    BinaryRowWriter row_writer(&packed, 32 * 1024, raw_pool);
+
+    auto serialized_min = SerializationUtils::SerializeBinaryRow(min_values_, raw_pool);
+    row_writer.WriteBinary(0, *serialized_min);
+
+    auto serialized_max = SerializationUtils::SerializeBinaryRow(max_values_, raw_pool);
+    row_writer.WriteBinary(1, *serialized_max);
+
+    row_writer.WriteArray(2, null_counts_);
+    row_writer.Complete();
+    return packed;
+}
+
+/// Rebuilds stats from a row holding (binary min row, binary max row, null-count array) at
+/// positions 0, 1 and 2.
+///
+/// Returns Status::Invalid when `row` or `pool` is null or when any of the three fields reads back
+/// as null, and propagates any error from deserializing the min/max rows. When both rows have no
+/// fields and the null-count array is empty, the shared EmptyStats() value is returned.
+Result<SimpleStats> SimpleStats::FromRow(const InternalRow* row, MemoryPool* pool) {
+    if (PAIMON_UNLIKELY(row == nullptr)) {
+        return Status::Invalid("internal row is null pointer");
+    }
+    if (PAIMON_UNLIKELY(pool == nullptr)) {
+        return Status::Invalid("memory pool is null pointer");
+    }
+
+    std::shared_ptr<Bytes> min_bytes = row->GetBinary(0);
+    if (PAIMON_UNLIKELY(min_bytes == nullptr)) {
+        return Status::Invalid("get min value from internal row failed.");
+    }
+    std::shared_ptr<Bytes> max_bytes = row->GetBinary(1);
+    if (PAIMON_UNLIKELY(max_bytes == nullptr)) {
+        return Status::Invalid("get max value from internal row failed.");
+    }
+    std::shared_ptr<InternalArray> null_count_array = row->GetArray(2);
+    if (PAIMON_UNLIKELY(null_count_array == nullptr)) {
+        return Status::Invalid("get null counts from internal row failed.");
+    }
+
+    PAIMON_ASSIGN_OR_RAISE(BinaryRow min_row, SerializationUtils::DeserializeBinaryRow(min_bytes));
+    PAIMON_ASSIGN_OR_RAISE(BinaryRow max_row, SerializationUtils::DeserializeBinaryRow(max_bytes));
+
+    const bool nothing_recorded = min_row.GetFieldCount() == 0 &&
+                                  max_row.GetFieldCount() == 0 && null_count_array->Size() == 0;
+    if (nothing_recorded) {
+        return SimpleStats::EmptyStats();
+    }
+    return SimpleStats(min_row, max_row,
+                       BinaryArray::FromLongArray(null_count_array.get(), pool));
+}
+
+/// Chains the hashes of the three members: the bytes of the max-row hash are murmur-hashed with
+/// the min-row hash as the final argument, then the bytes of the null-count hash are murmur-hashed
+/// with that intermediate result as the final argument.
+// NOTE(review): the final argument of HashUnsafeBytes is presumably the seed — confirm.
+int32_t SimpleStats::HashCode() const {
+    const int32_t min_row_hash = min_values_.HashCode();
+    int32_t max_row_hash = max_values_.HashCode();
+    int32_t null_counts_hash = null_counts_.HashCode();
+
+    const int32_t min_max_hash = MurmurHashUtils::HashUnsafeBytes(
+        reinterpret_cast<void*>(&max_row_hash), 0, sizeof(max_row_hash), min_row_hash);
+    return MurmurHashUtils::HashUnsafeBytes(reinterpret_cast<void*>(&null_counts_hash), 0,
+                                            sizeof(null_counts_hash), min_max_hash);
+}
+
+/// Two stats compare equal when they are the same object, or when their min rows, max rows and
+/// null-count arrays each compare equal (checked in that order).
+bool SimpleStats::operator==(const SimpleStats& other) const {
+    if (this != &other) {
+        if (!(min_values_ == other.min_values_)) {
+            return false;
+        }
+        if (!(max_values_ == other.max_values_)) {
+            return false;
+        }
+        if (!(null_counts_ == other.null_counts_)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/// Returns the Arrow struct type that describes a serialized SimpleStats value:
+/// non-nullable binary `_MIN_VALUES`, non-nullable binary `_MAX_VALUES`, and a nullable
+/// `_NULL_COUNTS` list whose nullable `item` elements are int64.
+///
+/// The type is built once and cached in a function-local static. The static is declared const so
+/// the cached pointer cannot be reseated after initialization.
+const std::shared_ptr<arrow::DataType>& SimpleStats::DataType() {
+    static const std::shared_ptr<arrow::DataType> data_type = arrow::struct_(
+        {arrow::field("_MIN_VALUES", arrow::binary(), /*nullable=*/false),
+         arrow::field("_MAX_VALUES", arrow::binary(), /*nullable=*/false),
+         arrow::field("_NULL_COUNTS", arrow::list(arrow::field("item", arrow::int64(), true)),
+                      /*nullable=*/true)});
+    return data_type;
+}
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats.h
b/src/paimon/core/stats/simple_stats.h
new file mode 100644
index 0000000..5bb67dc
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats.h
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "arrow/api.h"
+#include "fmt/format.h"
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/binary_row_writer.h"
+#include "paimon/result.h"
+namespace arrow {
+class ArrayBuilder;
+} // namespace arrow
+
+namespace paimon {
+class InternalRow;
+class MemoryPool;
+
+/// The statistics for columns. The following stats are supported:
+///
+/// <ul>
+/// <li>min_values: the minimum values of the columns
+/// <li>max_values: the maximum values of the columns
+/// <li>null_counts: the number of nulls of the columns
+/// </ul>
+///
+/// All statistics are stored in binary form, which can significantly reduce memory consumption;
+/// the cost is that the column type needs to be known when reading a value back.
+class SimpleStats {
+ public:
+    /// Stores copies of the given min row, max row and null-count array.
+    SimpleStats(const BinaryRow& min_values, const BinaryRow& max_values,
+                const BinaryArray& null_counts)
+        : min_values_(min_values), max_values_(max_values), null_counts_(null_counts) {}
+
+    /// Empty stats for 0 column number.
+    static const SimpleStats& EmptyStats();
+
+    /// Rebuilds stats from `row`; fails with an error result when `row` or `pool` is null.
+    static Result<SimpleStats> FromRow(const InternalRow* row, MemoryPool* pool);
+
+    /// Arrow type describing the serialized form of a SimpleStats value.
+    static const std::shared_ptr<arrow::DataType>& DataType();
+
+    /// Row of per-column minimum values.
+    const BinaryRow& MinValues() const { return min_values_; }
+
+    /// Row of per-column maximum values.
+    const BinaryRow& MaxValues() const { return max_values_; }
+
+    /// Array of per-column null counts.
+    const BinaryArray& NullCounts() const { return null_counts_; }
+
+    /// Serializes these stats into a BinaryRow.
+    BinaryRow ToRow() const;
+
+    /// Short identity string: "SimpleStats@" followed by the hash code rendered as unsigned hex.
+    std::string ToString() const {
+        const auto unsigned_hash = static_cast<uint32_t>(HashCode());
+        return fmt::format("SimpleStats@{:#x}", unsigned_hash);
+    }
+
+    /// Hash derived from the three members.
+    int32_t HashCode() const;
+
+    /// Member-wise equality.
+    bool operator==(const SimpleStats& other) const;
+
+ private:
+    BinaryRow min_values_;
+    BinaryRow max_values_;
+    BinaryArray null_counts_;
+};
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_collector.cpp
b/src/paimon/core/stats/simple_stats_collector.cpp
new file mode 100644
index 0000000..7cd0440
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_collector.cpp
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats_collector.h"
+
+#include <cassert>
+#include <cstdint>
+#include <optional>
+#include <string>
+
+#include "arrow/api.h"
+#include "fmt/format.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/binary_string.h"
+#include "paimon/format/column_stats.h"
+
+namespace paimon {
+
+/// Keeps the schema and sizes the accumulator list to one (initially null) entry per schema field.
+SimpleStatsCollector::SimpleStatsCollector(const std::shared_ptr<arrow::Schema>& schema)
+    : schema_(schema), column_stats_(schema->num_fields()) {}
+
+namespace {
+
+/// Feeds one field into its per-column accumulator.
+///
+/// If `*slot` is still null it is first filled with the object returned by `create()`. The
+/// accumulator is then downcast to `StatsT` and receives either the value returned by `read()`
+/// or std::nullopt when `is_null` is set (`read` is not invoked in that case).
+///
+/// @return false when the accumulator stored in `*slot` is not a `StatsT`; true otherwise.
+template <typename StatsT, typename CreateFn, typename ReadFn>
+bool CollectField(std::shared_ptr<ColumnStats>* slot, bool is_null, const CreateFn& create,
+                  const ReadFn& read) {
+    if (*slot == nullptr) {
+        *slot = create();
+    }
+    auto* typed_stats = dynamic_cast<StatsT*>(slot->get());
+    if (typed_stats == nullptr) {
+        return false;
+    }
+    if (is_null) {
+        typed_stats->Collect(std::nullopt);
+    } else {
+        typed_stats->Collect(read());
+    }
+    return true;
+}
+
+}  // namespace
+
+/// Folds every field of `row` into the matching per-column accumulator.
+///
+/// Supported Arrow types: BOOL, INT8, INT16, INT32, INT64, FLOAT, DOUBLE, STRING/BINARY (both
+/// read through GetString) and DATE32.
+///
+/// @return Status::Invalid when the row's field count differs from the schema's or when an
+///         existing accumulator has an unexpected dynamic type; Status::NotImplemented for any
+///         other Arrow type; Status::OK otherwise. Fields preceding a failing one have already
+///         been collected when an error is returned.
+Status SimpleStatsCollector::Collect(const BinaryRow& row) {
+    assert(schema_);
+    if (schema_->num_fields() != row.GetFieldCount()) {
+        return Status::Invalid(fmt::format(
+            "fields count {} in partition schema not equal to fields count {} in partition",
+            schema_->num_fields(), row.GetFieldCount()));
+    }
+    for (int32_t i = 0; i < schema_->num_fields(); i++) {
+        const auto type = schema_->field(i)->type()->id();
+        std::shared_ptr<ColumnStats>* slot = &column_stats_[i];
+        const bool is_null = row.IsNullAt(i);
+        bool cast_ok = false;
+        switch (type) {
+            case arrow::Type::BOOL:
+                cast_ok = CollectField<BooleanColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateBooleanColumnStats(std::nullopt, std::nullopt,
+                                                                     std::nullopt);
+                    },
+                    [&row, i] { return row.GetBoolean(i); });
+                break;
+            case arrow::Type::INT8:
+                cast_ok = CollectField<TinyIntColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateTinyIntColumnStats(std::nullopt, std::nullopt,
+                                                                     std::nullopt);
+                    },
+                    [&row, i] { return row.GetByte(i); });
+                break;
+            case arrow::Type::INT16:
+                cast_ok = CollectField<SmallIntColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateSmallIntColumnStats(std::nullopt, std::nullopt,
+                                                                      std::nullopt);
+                    },
+                    [&row, i] { return row.GetShort(i); });
+                break;
+            case arrow::Type::INT32:
+                cast_ok = CollectField<IntColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateIntColumnStats(std::nullopt, std::nullopt,
+                                                                 std::nullopt);
+                    },
+                    [&row, i] { return row.GetInt(i); });
+                break;
+            case arrow::Type::INT64:
+                cast_ok = CollectField<BigIntColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateBigIntColumnStats(std::nullopt, std::nullopt,
+                                                                    std::nullopt);
+                    },
+                    [&row, i] { return row.GetLong(i); });
+                break;
+            case arrow::Type::FLOAT:
+                cast_ok = CollectField<FloatColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateFloatColumnStats(std::nullopt, std::nullopt,
+                                                                   std::nullopt);
+                    },
+                    [&row, i] { return row.GetFloat(i); });
+                break;
+            case arrow::Type::DOUBLE:
+                cast_ok = CollectField<DoubleColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateDoubleColumnStats(std::nullopt, std::nullopt,
+                                                                    std::nullopt);
+                    },
+                    [&row, i] { return row.GetDouble(i); });
+                break;
+            case arrow::Type::STRING:
+            case arrow::Type::BINARY:
+                cast_ok = CollectField<StringColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateStringColumnStats(std::nullopt, std::nullopt,
+                                                                    std::nullopt);
+                    },
+                    [&row, i] { return row.GetString(i).ToString(); });
+                break;
+            case arrow::Type::DATE32:
+                cast_ok = CollectField<DateColumnStats>(
+                    slot, is_null,
+                    [] {
+                        return ColumnStats::CreateDateColumnStats(std::nullopt, std::nullopt,
+                                                                  std::nullopt);
+                    },
+                    [&row, i] { return row.GetDate(i); });
+                break;
+            default:
+                return Status::NotImplemented(
+                    fmt::format("Do not support arrow type {}", static_cast<int32_t>(type)));
+        }
+        if (!cast_ok) {
+            // An accumulator of the wrong dynamic type is a programming error: trap in debug
+            // builds, report it in release builds.
+            assert(false);
+            return Status::Invalid("cast typed stats failed");
+        }
+    }
+    return Status::OK();
+}
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_collector.h
b/src/paimon/core/stats/simple_stats_collector.h
new file mode 100644
index 0000000..c8bd87b
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_collector.h
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/api.h"
+#include "paimon/format/column_stats.h"
+#include "paimon/result.h"
+#include "paimon/status.h"
+
+namespace arrow {
+class Schema;
+} // namespace arrow
+
+namespace paimon {
+
+class BinaryRow;
+class ColumnStats;
+
+/// Accumulates per-column statistics over BinaryRow values that follow a fixed Arrow schema.
+///
+/// One accumulator slot exists per schema field. A slot stays null until Collect() has processed
+/// a row for that field.
+class SimpleStatsCollector {
+ public:
+    /// @param schema schema of the rows that will be passed to Collect(); dereferenced
+    ///        immediately, so it must not be null.
+    explicit SimpleStatsCollector(const std::shared_ptr<arrow::Schema>& schema);
+
+    /// Folds every field of `row` into the per-column accumulators.
+    /// Returns a non-OK status when the row's field count does not match the schema or a field
+    /// type is not supported.
+    Status Collect(const BinaryRow& row);
+
+    /// Returns the accumulators in schema order.
+    /// Fails with Status::Invalid if any slot is still null.
+    Result<std::vector<std::shared_ptr<ColumnStats>>> GetResult() const {
+        std::vector<std::shared_ptr<ColumnStats>> stats;
+        // The final size is known up front; reserve once instead of growing per push_back.
+        stats.reserve(column_stats_.size());
+        for (const auto& col_stats : column_stats_) {
+            if (col_stats == nullptr) {
+                return Status::Invalid("column stats is nullptr");
+            }
+            stats.push_back(col_stats);
+        }
+        return stats;
+    }
+
+ private:
+    std::shared_ptr<arrow::Schema> schema_;
+    std::vector<std::shared_ptr<ColumnStats>> column_stats_;
+};
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_collector_test.cpp
b/src/paimon/core/stats/simple_stats_collector_test.cpp
new file mode 100644
index 0000000..bac30b4
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_collector_test.cpp
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats_collector.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <variant>
+
+#include "arrow/api.h"
+#include "gtest/gtest.h"
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/data_define.h"
+#include "paimon/core/stats/simple_stats.h"
+#include "paimon/core/stats/simple_stats_converter.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/testing/utils/binary_row_generator.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace paimon::test {
+
+// Collects two fully populated rows and checks that the converted stats hold the expected
+// per-column minimum and maximum together with a null count of zero for all nine columns.
+TEST(SimpleStatsCollectorTest, TestSimple) {
+    arrow::FieldVector columns = {
+        arrow::field("f0", arrow::boolean()),
+        arrow::field("f1", arrow::int8()),
+        arrow::field("f2", arrow::int16()),
+        arrow::field("f3", arrow::int32()),
+        arrow::field("f4", arrow::int64()),
+        arrow::field("f5", arrow::float32()),
+        arrow::field("f6", arrow::float64()),
+        arrow::field("f7", arrow::utf8()),
+        arrow::field("f8", arrow::date32()),
+    };
+    SimpleStatsCollector stats_collector(arrow::schema(columns));
+    auto memory_pool = GetDefaultPool();
+
+    auto first_row = BinaryRowGenerator::GenerateRow(
+        {true, static_cast<int8_t>(1), static_cast<int16_t>(1), static_cast<int32_t>(1),
+         static_cast<int64_t>(1), static_cast<float>(3.0), static_cast<double>(3.0),
+         std::string("abc"), 2025},
+        memory_pool.get());
+    ASSERT_OK(stats_collector.Collect(first_row));
+
+    auto second_row = BinaryRowGenerator::GenerateRow(
+        {false, static_cast<int8_t>(2), static_cast<int16_t>(2), static_cast<int32_t>(2),
+         static_cast<int64_t>(2), static_cast<float>(6.0), static_cast<double>(6.0),
+         std::string("bcd"), 2026},
+        memory_pool.get());
+    ASSERT_OK(stats_collector.Collect(second_row));
+
+    ASSERT_OK_AND_ASSIGN(auto collected, stats_collector.GetResult());
+    ASSERT_OK_AND_ASSIGN(SimpleStats actual_stats,
+                         SimpleStatsConverter::ToBinary(collected, memory_pool.get()));
+
+    auto expected_stats = BinaryRowGenerator::GenerateStats(
+        {false, static_cast<int8_t>(1), static_cast<int16_t>(1), static_cast<int32_t>(1),
+         static_cast<int64_t>(1), static_cast<float>(3.0), static_cast<double>(3.0),
+         std::string("abc"), 2025},
+        {true, static_cast<int8_t>(2), static_cast<int16_t>(2), static_cast<int32_t>(2),
+         static_cast<int64_t>(2), static_cast<float>(6.0), static_cast<double>(6.0),
+         std::string("bcd"), 2026},
+        std::vector<int64_t>({0, 0, 0, 0, 0, 0, 0, 0, 0}), GetDefaultPool().get());
+
+    ASSERT_EQ(actual_stats, expected_stats);
+}
+
+// Collects a single row whose first nine fields are null and whose last field is 100, then
+// checks that min/max are null for the null columns and that each null column counts one null.
+TEST(SimpleStatsCollectorTest, TestNull) {
+    arrow::FieldVector columns = {
+        arrow::field("f0", arrow::boolean()),
+        arrow::field("f1", arrow::int8()),
+        arrow::field("f2", arrow::int16()),
+        arrow::field("f3", arrow::int32()),
+        arrow::field("f4", arrow::int64()),
+        arrow::field("f5", arrow::float32()),
+        arrow::field("f6", arrow::float64()),
+        arrow::field("f7", arrow::utf8()),
+        arrow::field("f8", arrow::date32()),
+        arrow::field("key", arrow::int32()),
+    };
+    SimpleStatsCollector stats_collector(arrow::schema(columns));
+    auto memory_pool = GetDefaultPool();
+
+    auto mostly_null_row = BinaryRowGenerator::GenerateRow(
+        {NullType(), NullType(), NullType(), NullType(), NullType(), NullType(), NullType(),
+         NullType(), NullType(), 100},
+        memory_pool.get());
+    ASSERT_OK(stats_collector.Collect(mostly_null_row));
+
+    ASSERT_OK_AND_ASSIGN(auto collected, stats_collector.GetResult());
+    ASSERT_EQ(10, collected.size());
+    ASSERT_OK_AND_ASSIGN(SimpleStats actual_stats,
+                         SimpleStatsConverter::ToBinary(collected, memory_pool.get()));
+
+    // With a single row, the min row and the max row must be identical.
+    ASSERT_EQ(actual_stats.MinValues(), actual_stats.MaxValues());
+    for (size_t column = 0; column < 9; ++column) {
+        ASSERT_TRUE(actual_stats.MinValues().IsNullAt(column));
+    }
+    ASSERT_EQ(actual_stats.MinValues().GetInt(9), 100);
+
+    ASSERT_OK_AND_ASSIGN(std::vector<int64_t> actual_null_counts,
+                         actual_stats.NullCounts().ToLongArray());
+    // Nine null columns with one null each, followed by the non-null "key" column.
+    std::vector<int64_t> expected_null_counts(9, 1);
+    expected_null_counts.push_back(0);
+    ASSERT_EQ(actual_null_counts, expected_null_counts);
+}
+
+// A row whose field count differs from the schema's (empty row vs. one-field schema) must be
+// rejected with the field-count mismatch message.
+TEST(SimpleStatsCollectorTest, TestInvalidPartition) {
+    arrow::FieldVector fields = {arrow::field("f0", arrow::boolean())};
+
+    auto schema = arrow::schema(fields);
+    SimpleStatsCollector collector(schema);
+    ASSERT_NOK_WITH_MSG(collector.Collect(BinaryRow::EmptyRow()), "partition schema not equal");
+}
+
+} // namespace paimon::test
diff --git a/src/paimon/core/stats/simple_stats_converter.cpp
b/src/paimon/core/stats/simple_stats_converter.cpp
new file mode 100644
index 0000000..d6a8075
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_converter.cpp
@@ -0,0 +1,299 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats_converter.h"
+
+#include <cstdint>
+#include <optional>
+#include <utility>
+
+#include "fmt/format.h"
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/binary_array_writer.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/binary_row_writer.h"
+#include "paimon/common/data/binary_string.h"
+#include "paimon/core/stats/simple_stats.h"
+#include "paimon/data/decimal.h"
+#include "paimon/data/timestamp.h"
+#include "paimon/defs.h"
+#include "paimon/format/column_stats.h"
+#include "paimon/status.h"
+
+namespace paimon {
+class MemoryPool;
+
+namespace {
+
+/// Writes one bound (min or max) of a column at `pos`: a null when the bound is absent, otherwise
+/// the contained value through `write_value(writer, pos, value)`.
+template <typename OptionalT, typename WriteFn>
+void WriteBound(BinaryRowWriter* writer, int32_t pos, const OptionalT& bound,
+                const WriteFn& write_value) {
+    if (bound == std::nullopt) {
+        writer->SetNullAt(pos);
+    } else {
+        write_value(writer, pos, bound.value());
+    }
+}
+
+/// Downcasts `stats` to `StatsT` and writes its Min() into `min_writer` and its Max() into
+/// `max_writer` at `pos` (min first).
+///
+/// @return false, without writing anything, when `stats` is not a `StatsT`; true otherwise.
+template <typename StatsT, typename WriteFn>
+bool WriteMinMax(ColumnStats* stats, int32_t pos, BinaryRowWriter* min_writer,
+                 BinaryRowWriter* max_writer, const WriteFn& write_value) {
+    auto* typed_stats = dynamic_cast<StatsT*>(stats);
+    if (typed_stats == nullptr) {
+        return false;
+    }
+    WriteBound(min_writer, pos, typed_stats->Min(), write_value);
+    WriteBound(max_writer, pos, typed_stats->Max(), write_value);
+    return true;
+}
+
+}  // namespace
+
+/// Converts per-column statistics into the binary SimpleStats form.
+///
+/// For column i, the min value goes to field i of the min row, the max value to field i of the
+/// max row and the null count to element i of the null-count array; an absent value is written
+/// as null. Nested column stats (NestedColumnStats) always get null min/max.
+///
+/// @param stats_vec one stats object per column, in column order; entries must not be null.
+/// @param pool      memory pool handed to the row/array writers and string conversion.
+/// @return Status::Invalid when an entry is null, when an entry's dynamic type does not match its
+///         GetFieldType(), when a decimal bound's scale differs from the column scale, or when the
+///         field type is not supported; the assembled SimpleStats otherwise.
+Result<SimpleStats> SimpleStatsConverter::ToBinary(
+    const std::vector<std::shared_ptr<ColumnStats>>& stats_vec, MemoryPool* pool) {
+    const auto row_field_count = static_cast<int32_t>(stats_vec.size());
+    BinaryRow min_values(row_field_count);
+    BinaryRowWriter min_writer(&min_values, 1024, pool);
+    BinaryRow max_values(row_field_count);
+    BinaryRowWriter max_writer(&max_values, 1024, pool);
+    BinaryArray null_counts;
+    BinaryArrayWriter null_writer(&null_counts, row_field_count, sizeof(int64_t), pool);
+    for (int32_t i = 0; i < row_field_count; i++) {
+        const auto& stats = stats_vec[i];
+        if (stats == nullptr) {
+            // Report a missing entry instead of dereferencing it below.
+            return Status::Invalid(
+                fmt::format("column stats at index {} is nullptr in SimpleStatsConverter", i));
+        }
+        auto type = stats->GetFieldType();
+        if (stats->NullCount()) {
+            null_writer.WriteLong(i, stats->NullCount().value());
+        } else {
+            null_writer.SetNullAt(i);
+        }
+        if (dynamic_cast<NestedColumnStats*>(stats.get()) != nullptr) {
+            // nested type, e.g., List, Map, Struct
+            min_writer.SetNullAt(i);
+            max_writer.SetNullAt(i);
+            continue;
+        }
+        switch (type) {
+            case FieldType::BOOLEAN:
+                if (!WriteMinMax<BooleanColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteBoolean(pos, value);
+                        })) {
+                    return Status::Invalid("cast BooleanColumnStats failed");
+                }
+                break;
+            case FieldType::TINYINT:
+                if (!WriteMinMax<TinyIntColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteByte(pos, value);
+                        })) {
+                    return Status::Invalid("cast TinyIntColumnStats failed");
+                }
+                break;
+            case FieldType::SMALLINT:
+                if (!WriteMinMax<SmallIntColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteShort(pos, value);
+                        })) {
+                    return Status::Invalid("cast SmallIntColumnStats failed");
+                }
+                break;
+            case FieldType::INT:
+                if (!WriteMinMax<IntColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteInt(pos, value);
+                        })) {
+                    return Status::Invalid("cast IntColumnStats failed");
+                }
+                break;
+            case FieldType::BIGINT:
+                if (!WriteMinMax<BigIntColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteLong(pos, value);
+                        })) {
+                    // Message now names the class actually being cast to.
+                    return Status::Invalid("cast BigIntColumnStats failed");
+                }
+                break;
+            case FieldType::FLOAT:
+                if (!WriteMinMax<FloatColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteFloat(pos, value);
+                        })) {
+                    return Status::Invalid("cast FloatColumnStats failed");
+                }
+                break;
+            case FieldType::DOUBLE:
+                if (!WriteMinMax<DoubleColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteDouble(pos, value);
+                        })) {
+                    return Status::Invalid("cast DoubleColumnStats failed");
+                }
+                break;
+            case FieldType::STRING:
+                if (!WriteMinMax<StringColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [pool](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteString(pos, BinaryString::FromString(value, pool));
+                        })) {
+                    return Status::Invalid("cast StringColumnStats failed");
+                }
+                break;
+            case FieldType::DATE:
+                // Dates are written through WriteInt.
+                if (!WriteMinMax<DateColumnStats>(
+                        stats.get(), i, &min_writer, &max_writer,
+                        [](BinaryRowWriter* writer, int32_t pos, const auto& value) {
+                            writer->WriteInt(pos, value);
+                        })) {
+                    return Status::Invalid("cast DateColumnStats failed");
+                }
+                break;
+            case FieldType::TIMESTAMP: {
+                auto* typed_stats = dynamic_cast<TimestampColumnStats*>(stats.get());
+                if (typed_stats == nullptr) {
+                    return Status::Invalid("cast TimestampColumnStats failed");
+                }
+                int32_t precision = typed_stats->GetPrecision();
+                // An absent bound is written with SetNullAt for compact precisions and with
+                // WriteTimestamp(std::nullopt) otherwise.
+                auto write_bound = [&](BinaryRowWriter* writer, const auto& bound) {
+                    if (bound == std::nullopt) {
+                        if (!Timestamp::IsCompact(precision)) {
+                            writer->WriteTimestamp(i, std::nullopt, precision);
+                        } else {
+                            writer->SetNullAt(i);
+                        }
+                    } else {
+                        writer->WriteTimestamp(i, bound.value(), precision);
+                    }
+                };
+                write_bound(&min_writer, typed_stats->Min());
+                write_bound(&max_writer, typed_stats->Max());
+                break;
+            }
+            case FieldType::DECIMAL: {
+                auto* typed_stats = dynamic_cast<DecimalColumnStats*>(stats.get());
+                if (typed_stats == nullptr) {
+                    return Status::Invalid("cast DecimalColumnStats failed");
+                }
+                auto precision = typed_stats->GetPrecision();
+                auto scale = typed_stats->GetScale();
+                if (typed_stats->Min() == std::nullopt) {
+                    if (!Decimal::IsCompact(precision)) {
+                        min_writer.WriteDecimal(i, std::nullopt, precision);
+                    } else {
+                        min_writer.SetNullAt(i);
+                    }
+                } else {
+                    auto min_decimal = typed_stats->Min().value();
+                    if (min_decimal.Scale() != scale) {
+                        return Status::Invalid(
+                            fmt::format("in SimpleStatsConverter decimal scale mismatch: min "
+                                        "decimal scale {}, target type scale {}",
+                                        min_decimal.Scale(), scale));
+                    }
+                    min_writer.WriteDecimal(i, min_decimal, precision);
+                }
+                if (typed_stats->Max() == std::nullopt) {
+                    if (!Decimal::IsCompact(precision)) {
+                        max_writer.WriteDecimal(i, std::nullopt, precision);
+                    } else {
+                        max_writer.SetNullAt(i);
+                    }
+                } else {
+                    auto max_decimal = typed_stats->Max().value();
+                    if (max_decimal.Scale() != scale) {
+                        return Status::Invalid(
+                            fmt::format("in SimpleStatsConverter decimal scale mismatch: max "
+                                        "decimal scale {}, target type scale {}",
+                                        max_decimal.Scale(), scale));
+                    }
+                    max_writer.WriteDecimal(i, max_decimal, precision);
+                }
+                break;
+            }
+            default:
+                return Status::Invalid(fmt::format("invalid type {} for SimpleStatsConverter",
+                                                   static_cast<int32_t>(type)));
+        }
+    }
+    min_writer.Complete();
+    max_writer.Complete();
+    null_writer.Complete();
+    return SimpleStats(min_values, max_values, null_counts);
+}
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_converter.h
b/src/paimon/core/stats/simple_stats_converter.h
new file mode 100644
index 0000000..903cf6e
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_converter.h
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "paimon/format/column_stats.h"
+#include "paimon/result.h"
+
+namespace paimon {
+
+class SimpleStats;
+class ColumnStats;
+class MemoryPool;
+
+class SimpleStatsConverter {  // Stateless helper exposing a single static conversion entry point.
+ public:
+ static Result<SimpleStats> ToBinary(const
std::vector<std::shared_ptr<ColumnStats>>& stats,
+ MemoryPool* pool);  // Packs per-column stats into one SimpleStats, or returns an error Status.
+};
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_evolution.cpp
b/src/paimon/core/stats/simple_stats_evolution.cpp
new file mode 100644
index 0000000..a9356ed
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_evolution.cpp
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats_evolution.h"
+
+#include <cassert>
+#include <cstddef>
+#include <string_view>
+
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/binary_array_writer.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/binary_string.h"
+#include "paimon/common/data/generic_row.h"
+#include "paimon/common/data/internal_array.h"
+#include "paimon/common/utils/object_utils.h"
+#include "paimon/common/utils/projected_array.h"
+#include "paimon/common/utils/projected_row.h"
+#include "paimon/core/casting/casted_row.h"
+#include "paimon/core/stats/simple_stats.h"
+#include "paimon/core/utils/field_mapping.h"
+#include "paimon/data/decimal.h"
+#include "paimon/data/timestamp.h"
+#include "paimon/status.h"
+
+namespace paimon {
+class Bytes;
+class CastExecutor;
+class InternalMap;
+class InternalRow;
+class MemoryPool;
+
+// Prepares everything Evolution() needs for one (data schema, table schema) pair:
+//  - placeholder stats, handed out when a file's stats cover no columns at all;
+//  - a field id -> (index in data schema, data field) lookup;
+//  - a table field name -> table field lookup.
+// `pool` is only used here, while building the placeholder null-count array.
+SimpleStatsEvolution::SimpleStatsEvolution(const std::vector<DataField>& data_fields,
+                                           const std::vector<DataField>& table_fields,
+                                           bool need_mapping,
+                                           const std::shared_ptr<MemoryPool>& pool)
+    : need_mapping_(need_mapping), data_fields_(data_fields), table_fields_(table_fields) {
+    // Placeholder min/max row with one slot per data field.
+    empty_values_ = std::make_shared<GenericRow>(data_fields.size());
+    // Placeholder null counts: one int64-sized slot per data field, each marked null.
+    empty_null_counts_ = std::make_shared<BinaryArray>();
+    BinaryArrayWriter array_writer(empty_null_counts_.get(), data_fields.size(),
+                                   /*element_size=*/sizeof(int64_t), pool.get());
+    for (size_t i = 0; i < data_fields.size(); ++i) {
+        array_writer.SetNullAt(i);
+    }
+    array_writer.Complete();
+
+    // Index the data fields by field id, remembering each field's position in the data schema.
+    for (size_t i = 0; i < data_fields.size(); i++) {
+        const auto& data_field = data_fields[i];
+        id_to_data_fields_.emplace(data_field.Id(), std::make_pair(i, data_field));
+    }
+    // Index the table fields by name.
+    for (const auto& field : table_fields) {
+        name_to_table_fields_.emplace(field.Name(), field);
+    }
+}
+
+// Read-only view of a null-count array re-ordered through `mapping`.
+// mapping[pos] is the index into the wrapped array, or a negative value when the position has
+// no counterpart there; such positions are non-null and yield `not_found_value`.
+// Only Size(), IsNullAt() and GetLong() are meant to be called; every other accessor is
+// declared privately below and defined as unsupported.
+class NullCountsEvoArray : public InternalArray {
+ public:
+    NullCountsEvoArray(const std::shared_ptr<InternalArray>& array,
+                       const std::vector<int32_t>& mapping, int64_t not_found_value)
+        : array_(array), mapping_(mapping), not_found_value_(not_found_value) {
+        assert(array_);
+    }
+    // Number of positions exposed by the view (one per mapping entry).
+    int32_t Size() const override {
+        return static_cast<int32_t>(mapping_.size());
+    }
+    // Unmapped positions are never null: GetLong() supplies not_found_value_ for them.
+    bool IsNullAt(int32_t pos) const override {
+        assert(static_cast<size_t>(pos) < mapping_.size());
+        if (mapping_[pos] < 0) {
+            return false;
+        }
+        return array_->IsNullAt(mapping_[pos]);
+    }
+    // Returns the wrapped value for mapped positions and not_found_value_ otherwise.
+    int64_t GetLong(int32_t pos) const override {
+        assert(static_cast<size_t>(pos) < mapping_.size());
+        const int32_t source_pos = mapping_[pos];
+        return source_pos < 0 ? not_found_value_ : array_->GetLong(source_pos);
+    }
+
+ private:
+    // ============================= Unsupported Methods ================================
+    bool GetBoolean(int32_t pos) const override;
+    char GetByte(int32_t pos) const override;
+    int16_t GetShort(int32_t pos) const override;
+    int32_t GetInt(int32_t pos) const override;
+    int32_t GetDate(int32_t pos) const override;
+    float GetFloat(int32_t pos) const override;
+    double GetDouble(int32_t pos) const override;
+    BinaryString GetString(int32_t pos) const override;
+    std::string_view GetStringView(int32_t pos) const override;
+    Decimal GetDecimal(int32_t pos, int32_t precision, int32_t scale) const override;
+    Timestamp GetTimestamp(int32_t pos, int32_t precision) const override;
+    std::shared_ptr<Bytes> GetBinary(int32_t pos) const override;
+    std::shared_ptr<InternalArray> GetArray(int32_t pos) const override;
+    std::shared_ptr<InternalMap> GetMap(int32_t pos) const override;
+    std::shared_ptr<InternalRow> GetRow(int32_t pos, int32_t num_fields) const override;
+    Result<std::vector<char>> ToBooleanArray() const override;
+    Result<std::vector<char>> ToByteArray() const override;
+    Result<std::vector<int16_t>> ToShortArray() const override;
+    Result<std::vector<int32_t>> ToIntArray() const override;
+    Result<std::vector<int64_t>> ToLongArray() const override;
+    Result<std::vector<float>> ToFloatArray() const override;
+    Result<std::vector<double>> ToDoubleArray() const override;
+
+ private:
+    std::shared_ptr<InternalArray> array_;   // wrapped source array
+    std::vector<int32_t> mapping_;           // view position -> source index, negative if unmapped
+    int64_t not_found_value_;                // value reported for unmapped positions
+};
+
+Result<SimpleStatsEvolution::EvolutionStats> SimpleStatsEvolution::Evolution(
+ const SimpleStats& stats, int64_t row_count,
+ const std::optional<std::vector<std::string>>& dense_fields) {  // Adapts stored stats to the table schema.
+ SimpleStatsEvolution::EvolutionStats evolution_stats(
+ std::make_shared<BinaryRow>(stats.MinValues()),
+ std::make_shared<BinaryRow>(stats.MaxValues()),
+ std::make_shared<BinaryArray>(stats.NullCounts()));  // Start from the stats as stored.
+ auto& min_values = evolution_stats.min_values;
+ auto& max_values = evolution_stats.max_values;
+ auto& null_counts = evolution_stats.null_counts;
+ if (dense_fields != std::nullopt && dense_fields.value().empty()) {
+ // optimize for empty dense fields
+ min_values = empty_values_;
+ max_values = empty_values_;
+ null_counts = empty_null_counts_;  // Shared placeholders built once in the constructor.
+ } else if (dense_fields != std::nullopt) {  // Stats only cover the named fields.
+ // create dense index mapping
+ std::vector<int32_t> data_idx_to_dense_idx;  // Data field index -> index in dense_fields, -1 if not listed.
+ std::optional<std::vector<int32_t>> cached_dense_idx =
+ dense_fields_mapping_.Find(dense_fields.value());
+ if (!cached_dense_idx) {  // Cache miss: build the mapping by field name and store it.
+ std::map<std::string, int32_t> field_name_to_dense_idx =
+ ObjectUtils::CreateIdentifierToIndexMap(dense_fields.value());
+ data_idx_to_dense_idx.resize(data_fields_.size(), -1);
+ for (size_t i = 0; i < data_fields_.size(); ++i) {
+ auto iter =
field_name_to_dense_idx.find(data_fields_[i].Name());
+ if (iter != field_name_to_dense_idx.end()) {
+ data_idx_to_dense_idx[i] = iter->second;
+ }
+ }
+ dense_fields_mapping_.Insert(dense_fields.value(),
data_idx_to_dense_idx);
+ } else {  // Cache hit: reuse the mapping stored for the same dense_fields.
+ data_idx_to_dense_idx = std::move(cached_dense_idx).value();
+ }
+
+ min_values = std::make_shared<ProjectedRow>(min_values,
data_idx_to_dense_idx);
+ max_values = std::make_shared<ProjectedRow>(max_values,
data_idx_to_dense_idx);
+ null_counts = std::make_shared<ProjectedArray>(null_counts,
data_idx_to_dense_idx);  // Wrap all three stats with the same mapping.
+ }
+
+ if (!need_mapping_) {  // No schema mapping requested at construction; row_count is unused on this path.
+ return evolution_stats;
+ }
+ return MappingStats(row_count, evolution_stats);
+}
+
+Result<SimpleStatsEvolution::EvolutionStats>
SimpleStatsEvolution::MappingStats(
+ int64_t row_count, const SimpleStatsEvolution::EvolutionStats& src_stats)
const {
+ SimpleStatsEvolution::EvolutionStats evolution_stats = src_stats;  // Work on a copy; src_stats is left untouched.
+ auto& min_values = evolution_stats.min_values;
+ auto& max_values = evolution_stats.max_values;
+ auto& null_counts = evolution_stats.null_counts;
+
+ std::vector<int32_t> table_idx_to_data_idx(table_fields_.size(), -1);  // -1 = field id absent from data schema.
+ std::vector<DataField> projected_data_fields;  // Per table field: matching data field, else the table field itself.
+ projected_data_fields.reserve(table_fields_.size());
+ std::map<int32_t, int32_t> field_id_to_data_idx =
ObjectUtils::CreateIdentifierToIndexMap(
+ data_fields_, [](const DataField& field) -> int32_t { return
field.Id(); });
+ for (size_t i = 0; i < table_fields_.size(); ++i) {  // Match table fields to data fields by field id.
+ auto iter = field_id_to_data_idx.find(table_fields_[i].Id());
+ if (iter != field_id_to_data_idx.end()) {
+ table_idx_to_data_idx[i] = iter->second;
+ projected_data_fields.push_back(data_fields_[iter->second]);
+ } else {  // Field id unknown to the data schema: mapping entry stays -1.
+ projected_data_fields.push_back(table_fields_[i]);
+ }
+ }
+ min_values = std::make_shared<ProjectedRow>(min_values,
table_idx_to_data_idx);
+ max_values = std::make_shared<ProjectedRow>(max_values,
table_idx_to_data_idx);
+ null_counts =
+ std::make_shared<NullCountsEvoArray>(null_counts,
table_idx_to_data_idx, row_count);  // Unmapped fields report row_count as their null count.
+
+ // cast mapping
+ PAIMON_ASSIGN_OR_RAISE(
+ std::vector<std::shared_ptr<CastExecutor>> cast_executors,
+ FieldMappingBuilder::CreateDataCastExecutors(table_fields_,
projected_data_fields));
+
+ bool need_cast = false;  // Becomes true if any entry of cast_executors is non-null.
+ for (const auto& executor : cast_executors) {
+ if (executor) {
+ need_cast = true;
+ break;
+ }
+ }
+ if (need_cast) {  // Wrap min/max in CastedRow only when at least one executor exists.
+ PAIMON_ASSIGN_OR_RAISE(min_values, CastedRow::Create(cast_executors,
projected_data_fields,
+ table_fields_,
min_values));
+ PAIMON_ASSIGN_OR_RAISE(max_values, CastedRow::Create(cast_executors,
projected_data_fields,
+ table_fields_,
max_values));
+ }
+ return evolution_stats;
+}
+
+// ============================= Unsupported Methods ================================
+bool NullCountsEvoArray::GetBoolean(int32_t pos) const {  // Unsupported accessor: asserts, then forwards to the wrapped array.
+ assert(false);  // NOTE(review): a no-op when NDEBUG is defined; mapping_[pos] may then be negative — confirm callers never get here.
+ return array_->GetBoolean(mapping_[pos]);
+}
+
+char NullCountsEvoArray::GetByte(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetByte(mapping_[pos]);
+}
+
+int16_t NullCountsEvoArray::GetShort(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetShort(mapping_[pos]);
+}
+
+int32_t NullCountsEvoArray::GetInt(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetInt(mapping_[pos]);
+}
+
+int32_t NullCountsEvoArray::GetDate(int32_t pos) const {  // Unsupported accessor: delegates to GetInt(), which asserts.
+ return NullCountsEvoArray::GetInt(pos);
+}
+
+float NullCountsEvoArray::GetFloat(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetFloat(mapping_[pos]);
+}
+
+double NullCountsEvoArray::GetDouble(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetDouble(mapping_[pos]);
+}
+
+BinaryString NullCountsEvoArray::GetString(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetString(mapping_[pos]);
+}
+
+std::string_view NullCountsEvoArray::GetStringView(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetStringView(mapping_[pos]);
+}
+
+Decimal NullCountsEvoArray::GetDecimal(int32_t pos, int32_t precision, int32_t
scale) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetDecimal(mapping_[pos], precision, scale);
+}
+
+Timestamp NullCountsEvoArray::GetTimestamp(int32_t pos, int32_t precision)
const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetTimestamp(mapping_[pos], precision);
+}
+
+std::shared_ptr<Bytes> NullCountsEvoArray::GetBinary(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetBinary(mapping_[pos]);
+}
+
+std::shared_ptr<InternalArray> NullCountsEvoArray::GetArray(int32_t pos) const
{  // Unsupported accessor.
+ assert(false);
+ return array_->GetArray(mapping_[pos]);
+}
+
+std::shared_ptr<InternalMap> NullCountsEvoArray::GetMap(int32_t pos) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetMap(mapping_[pos]);
+}
+
+std::shared_ptr<InternalRow> NullCountsEvoArray::GetRow(int32_t pos, int32_t
num_fields) const {  // Unsupported accessor.
+ assert(false);
+ return array_->GetRow(mapping_[pos], num_fields);
+}
+
+Result<std::vector<char>> NullCountsEvoArray::ToBooleanArray() const {  // Bulk conversion is unsupported: always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to
boolean array");
+}
+Result<std::vector<char>> NullCountsEvoArray::ToByteArray() const {  // Always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to byte
array");
+}
+Result<std::vector<int16_t>> NullCountsEvoArray::ToShortArray() const {  // Always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to short
array");
+}
+Result<std::vector<int32_t>> NullCountsEvoArray::ToIntArray() const {  // Always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to int
array");
+}
+Result<std::vector<int64_t>> NullCountsEvoArray::ToLongArray() const {  // Always returns Status::Invalid, even though GetLong() is supported.
+ return Status::Invalid("NullCountsEvoArray do not support convert to long
array");
+}
+Result<std::vector<float>> NullCountsEvoArray::ToFloatArray() const {  // Always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to float
array");
+}
+Result<std::vector<double>> NullCountsEvoArray::ToDoubleArray() const {  // Always returns Status::Invalid.
+ return Status::Invalid("NullCountsEvoArray do not support convert to
double array");
+}
+
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_evolution.h
b/src/paimon/core/stats/simple_stats_evolution.h
new file mode 100644
index 0000000..556e7df
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_evolution.h
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/generic_row.h"
+#include "paimon/common/types/data_field.h"
+#include "paimon/common/utils/concurrent_hash_map.h"
+#include "paimon/core/stats/simple_stats.h"
+#include "paimon/result.h"
+
+namespace paimon {
+class BinaryArray;
+class GenericRow;
+class InternalArray;
+class InternalRow;
+class MemoryPool;
+class SimpleStats;
+
+/// Adapts `SimpleStats` written under a data schema to the table schema (see `Evolution`).
+class SimpleStatsEvolution {
+ public:
+ SimpleStatsEvolution(const std::vector<DataField>& data_fields,
+ const std::vector<DataField>& table_fields, bool
need_mapping,
+ const std::shared_ptr<MemoryPool>& pool);  // pool is used to build the placeholder null-count array.
+
+ struct EvolutionStats {  // Result bundle: min row, max row and null-count array.
+ EvolutionStats() = default;
+
+ EvolutionStats(const std::shared_ptr<InternalRow>& _min_values,
+ const std::shared_ptr<InternalRow>& _max_values,
+ const std::shared_ptr<InternalArray>& _null_counts)
+ : min_values(_min_values), max_values(_max_values),
null_counts(_null_counts) {}
+ std::shared_ptr<InternalRow> min_values;
+ std::shared_ptr<InternalRow> max_values;
+ std::shared_ptr<InternalArray> null_counts;
+ };
+
+ Result<EvolutionStats> Evolution(const SimpleStats& stats, int64_t
row_count,
+ const
std::optional<std::vector<std::string>>& dense_fields);  // dense_fields: names the stats cover; std::nullopt skips the dense projection.
+
+ const std::map<int32_t, std::pair<int32_t, DataField>>&
GetFieldIdToDataField() const {  // field id -> (index in data_fields, data field)
+ return id_to_data_fields_;
+ }
+
+ const std::map<std::string, DataField>& GetFieldNameToTableField() const {  // table field name -> table field
+ return name_to_table_fields_;
+ }
+
+ private:
+ Result<SimpleStatsEvolution::EvolutionStats> MappingStats(
+ int64_t row_count, const SimpleStatsEvolution::EvolutionStats&
src_stats) const;  // Field-id mapping and casting step; called by Evolution() when need_mapping_ is set.
+
+ private:
+ bool need_mapping_ = false;  // When false, Evolution() returns before MappingStats().
+ std::vector<DataField> data_fields_;  // Fields of the schema the stats were written with.
+ std::vector<DataField> table_fields_;  // Fields of the schema the stats are read through.
+ std::shared_ptr<GenericRow> empty_values_;  // Placeholder min/max row for empty dense fields.
+ std::shared_ptr<BinaryArray> empty_null_counts_;  // Placeholder null counts, every slot set to null.
+
+ std::map<int32_t, std::pair<int32_t, DataField>> id_to_data_fields_;
+ std::map<std::string, DataField> name_to_table_fields_;
+
+ // dense field names -> idx mapping
+ ConcurrentHashMap<std::vector<std::string>, std::vector<int32_t>,
VectorStringHashCompare>
+ dense_fields_mapping_;  // Cache filled by Evolution(), keyed by the dense_fields list.
+};
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_evolution_test.cpp
b/src/paimon/core/stats/simple_stats_evolution_test.cpp
new file mode 100644
index 0000000..0281341
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_evolution_test.cpp
@@ -0,0 +1,479 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats_evolution.h"
+
+#include <cstddef>
+#include <functional>
+#include <string>
+#include <variant>
+
+#include "gtest/gtest.h"
+#include "paimon/common/data/binary_array.h"
+#include "paimon/common/data/binary_row.h"
+#include "paimon/common/data/data_define.h"
+#include "paimon/common/data/internal_array.h"
+#include "paimon/common/utils/internal_row_utils.h"
+#include "paimon/core/schema/schema_manager.h"
+#include "paimon/core/schema/table_schema.h"
+#include "paimon/core/stats/simple_stats.h"
+#include "paimon/data/decimal.h"
+#include "paimon/data/timestamp.h"
+#include "paimon/fs/local/local_file_system.h"
+#include "paimon/memory/memory_pool.h"
+#include "paimon/status.h"
+#include "paimon/testing/utils/binary_row_generator.h"
+#include "paimon/testing/utils/testharness.h"
+
+namespace arrow {
+class Schema;
+} // namespace arrow
+namespace paimon {
+class FileSystem;
+class InternalRow;
+} // namespace paimon
+
+namespace paimon::test {
+class SimpleStatsEvolutionTest : public ::testing::Test {  // Fixture: two in-memory schemas plus stats builders and checkers.
+ public:
+ void SetUp() override {  // Parses the old (id 0) and new (id 1) schemas used by the schema-change tests.
+ std::string old_schema_str = R"==({
+ "version" : 3,
+ "id" : 0,
+ "fields" : [ {
+ "id" : 0,
+ "name" : "key0",
+ "type" : "INT"
+ }, {
+ "id" : 1,
+ "name" : "key1",
+ "type" : "INT"
+ }, {
+ "id" : 2,
+ "name" : "f0",
+ "type" : "STRING"
+ }, {
+ "id" : 3,
+ "name" : "f1",
+ "type" : "TIMESTAMP(9)"
+ }, {
+ "id" : 4,
+ "name" : "f2",
+ "type" : "DECIMAL(5, 2)"
+ }, {
+ "id" : 5,
+ "name" : "f3",
+ "type" : "STRING"
+ }, {
+ "id" : 6,
+ "name" : "f4",
+ "type" : "DATE"
+ }, {
+ "id" : 7,
+ "name" : "f5",
+ "type" : "BIGINT"
+ } ],
+ "highestFieldId" : 7,
+ "partitionKeys" : [ "key0", "key1" ],
+ "primaryKeys" : [ ],
+ "options" : {
+ "manifest.format" : "orc",
+ "file.format" : "orc"
+ },
+ "timeMillis" : 1732604147800
+})==";  // Old schema: used as the data schema in the schema-change tests.
+
+ std::string new_schema_str = R"==({
+ "version" : 3,
+ "id" : 1,
+ "fields" : [ {
+ "id" : 6,
+ "name" : "f4",
+ "type" : "DATE"
+ }, {
+ "id" : 0,
+ "name" : "key0",
+ "type" : "INT"
+ }, {
+ "id" : 1,
+ "name" : "key1",
+ "type" : "INT"
+ }, {
+ "id" : 2,
+ "name" : "f3",
+ "type" : "INT"
+ }, {
+ "id" : 3,
+ "name" : "f1",
+ "type" : "TIMESTAMP(9)"
+ }, {
+ "id" : 4,
+ "name" : "f2",
+ "type" : "DECIMAL(6, 3)"
+ }, {
+ "id" : 5,
+ "name" : "f0",
+ "type" : "BOOLEAN"
+ }, {
+ "id" : 8,
+ "name" : "f6",
+ "type" : "INT"
+ } ],
+ "highestFieldId" : 8,
+ "partitionKeys" : [ "key0", "key1" ],
+ "primaryKeys" : [ ],
+ "options" : {
+ "manifest.format" : "orc",
+ "file.format" : "orc"
+ },
+ "timeMillis" : 1732605243483
+})==";  // New schema: reorders fields, renames and retypes ids 2/4/5, drops id 7, adds id 8.
+
+ ASSERT_OK_AND_ASSIGN(old_schema_,
TableSchema::CreateFromJson(old_schema_str));
+ ASSERT_OK_AND_ASSIGN(new_schema_,
TableSchema::CreateFromJson(new_schema_str));
+ }
+
+ SimpleStats CreateStats() const {  // Stats in old-schema field order (key0, key1, f0, f1, f2, f3, f4, f5).
+ auto min = BinaryRowGenerator::GenerateRow(
+ {0, 1, std::string("100"), TimestampType(Timestamp(0, 0), 9),
Decimal(5, 2, 12345),
+ std::string("nO"), 10, NullType()},
+ pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow(
+ {20, 21, std::string("110"), TimestampType(Timestamp(0, 1), 9),
Decimal(5, 2, 32345),
+ std::string("Yes"), 20, NullType()},
+ pool_.get());
+ auto null_count = BinaryArray::FromLongArray({0, 0, 1, 0, 0, 0, 1, 4},
pool_.get());
+ return SimpleStats(min, max, null_count);
+ }
+
+ SimpleStats CreateNewStats() const {  // Expected result of evolving CreateStats() to the new schema.
+ auto min =
+ BinaryRowGenerator::GenerateRow({10, 0, 1, 100,
TimestampType(Timestamp(0, 0), 9),
+ Decimal(6, 3, 123450), false,
NullType()},
+ pool_.get());
+ auto max =
+ BinaryRowGenerator::GenerateRow({20, 20, 21, 110,
TimestampType(Timestamp(0, 1), 9),
+ Decimal(6, 3, 323450), true,
NullType()},
+ pool_.get());
+ auto null_count = BinaryArray::FromLongArray({1, 0, 0, 1, 0, 0, 0, 4},
pool_.get());
+ return SimpleStats(min, max, null_count);
+ }
+
+ void CheckResultRow(const InternalRow& result, const InternalRow& expected,
+ const std::vector<DataField>& fields) const {  // Compares two rows field by field using getters built from `fields`.
+ auto collect_variant =
+ [](const InternalRow& row,
+ const std::shared_ptr<arrow::Schema>& schema) ->
std::vector<VariantType> {
+ EXPECT_OK_AND_ASSIGN(auto getters,
+ InternalRowUtils::CreateFieldGetters(schema,
/*use_view=*/true));
+ std::vector<VariantType> ret;
+ for (const auto& getter : getters) {
+ ret.push_back(getter(row));
+ }
+ return ret;
+ };
+ auto schema = DataField::ConvertDataFieldsToArrowSchema(fields);
+ auto result_variants = collect_variant(result, schema);
+ auto expected_variants = collect_variant(expected, schema);
+ ASSERT_EQ(result_variants, expected_variants);
+ }
+
+ void CheckResultArray(const InternalArray& result, const
std::vector<VariantType>& expected) {  // Checks size, nullness and int64 value of every slot.
+ ASSERT_EQ(result.Size(), expected.size());  // NOTE(review): compares int32_t with size_t; may warn under -Wsign-compare.
+ for (size_t i = 0; i < expected.size(); ++i) {
+ if (DataDefine::IsVariantNull(expected[i])) {
+ ASSERT_TRUE(result.IsNullAt(i));
+ } else {
+ ASSERT_FALSE(result.IsNullAt(i));
+ ASSERT_EQ(result.GetLong(i),
DataDefine::GetVariantValue<int64_t>(expected[i]));
+ }
+ }
+ }
+
+ private:  // NOTE(review): the TEST_F bodies read these members directly; presumably tests are built with access checks relaxed — confirm.
+ std::shared_ptr<MemoryPool> pool_ = GetDefaultPool();
+ std::shared_ptr<FileSystem> fs_ = std::make_shared<LocalFileSystem>();
+ std::shared_ptr<TableSchema> old_schema_;
+ std::shared_ptr<TableSchema> new_schema_;
+};
+
+TEST_F(SimpleStatsEvolutionTest, TestNoChangeOfFields) {  // Same schema, no dense fields: stats must come back unchanged.
+ std::string table_root =
+ paimon::test::GetDataDir() +
+
"orc/append_table_alter_table_with_cast.db/append_table_alter_table_with_cast";
+ SchemaManager manager(fs_, table_root);
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema,
manager.ReadSchema(/*schema_id=*/0));
+ ASSERT_TRUE(schema);
+
+ SimpleStatsEvolution evo(schema->Fields(), schema->Fields(),
/*need_mapping=*/false, pool_);
+ SimpleStats stats = CreateStats();
+ ASSERT_OK_AND_ASSIGN(SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
/*dense_fields=*/std::nullopt));
+ CheckResultRow(*(new_stats.min_values), stats.min_values_,
schema->Fields());
+ CheckResultRow(*(new_stats.max_values), stats.max_values_,
schema->Fields());
+ CheckResultArray(*(new_stats.null_counts), {0l, 0l, 1l, 0l, 0l, 0l, 1l,
4l});  // Same values CreateStats() put into the null-count array.
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestNoSchemaChangeWithEmptyDenseFields) {  // Empty dense_fields: every min/max/null-count slot must be null.
+ std::string table_root =
+ paimon::test::GetDataDir() +
+
"orc/append_table_alter_table_with_cast.db/append_table_alter_table_with_cast";
+ SchemaManager manager(fs_, table_root);
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema,
manager.ReadSchema(/*schema_id=*/0));
+ ASSERT_TRUE(schema);
+
+ SimpleStatsEvolution evo(schema->Fields(), schema->Fields(),
/*need_mapping=*/false, pool_);
+
+ auto null_count = BinaryArray::FromLongArray(std::vector<int64_t>(),
pool_.get());
+ SimpleStats stats(/*min_values=*/BinaryRow::EmptyRow(),
/*max_values=*/BinaryRow::EmptyRow(),
+ null_count);  // Input stats carry no columns at all.
+
+ ASSERT_OK_AND_ASSIGN(SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+
/*dense_fields=*/std::vector<std::string>({})));
+
+ auto expected_min =
+ BinaryRowGenerator::GenerateRow({NullType(), NullType(), NullType(),
NullType(), NullType(),
+ NullType(), NullType(), NullType()},
+ pool_.get());
+ auto expected_max =
+ BinaryRowGenerator::GenerateRow({NullType(), NullType(), NullType(),
NullType(), NullType(),
+ NullType(), NullType(), NullType()},
+ pool_.get());
+
+ CheckResultRow(*(new_stats.min_values), expected_min, schema->Fields());
+ CheckResultRow(*(new_stats.max_values), expected_max, schema->Fields());
+ CheckResultArray(*(new_stats.null_counts), {NullType(), NullType(),
NullType(), NullType(),
+ NullType(), NullType(),
NullType(), NullType()});
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestNoSchemaChangeWithDenseFields) {  // Dense stats for 4 of 8 fields are spread to full width; the rest are null.
+ std::string table_root =
+ paimon::test::GetDataDir() +
+
"orc/append_table_alter_table_with_cast.db/append_table_alter_table_with_cast";
+ SchemaManager manager(fs_, table_root);
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema,
manager.ReadSchema(/*schema_id=*/0));
+ ASSERT_TRUE(schema);
+
+ SimpleStatsEvolution evo(schema->Fields(), schema->Fields(),
/*need_mapping=*/false, pool_);
+
+ auto min = BinaryRowGenerator::GenerateRow(
+ {0, 1, TimestampType(Timestamp(0, 1), 9), NullType()}, pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow(
+ {20, 21, TimestampType(Timestamp(20, 21), 9), NullType()},
pool_.get());
+ auto null_count = BinaryArray::FromLongArray({0, 0, 1, 4}, pool_.get());
+ SimpleStats stats(min, max, null_count);  // One slot per dense field: key0, key1, f1, f5.
+
+ ASSERT_OK_AND_ASSIGN(
+ SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+ /*dense_fields=*/std::vector<std::string>({"key0",
"key1", "f1", "f5"})));
+
+ auto expected_min =
+ BinaryRowGenerator::GenerateRow({0, 1, NullType(),
TimestampType(Timestamp(0, 1), 9),
+ NullType(), NullType(), NullType(),
NullType()},
+ pool_.get());
+ auto expected_max =
+ BinaryRowGenerator::GenerateRow({20, 21, NullType(),
TimestampType(Timestamp(20, 21), 9),
+ NullType(), NullType(), NullType(),
NullType()},
+ pool_.get());
+
+ CheckResultRow(*(new_stats.min_values), expected_min, schema->Fields());
+ CheckResultRow(*(new_stats.max_values), expected_max, schema->Fields());
+ CheckResultArray(*(new_stats.null_counts),
+ {0l, 0l, NullType(), 1l, NullType(), NullType(),
NullType(), 4l});
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestSchemaChangeWithNoDenseFields) {  // Full-width old-schema stats evolved to the new schema.
+ SimpleStatsEvolution evo(old_schema_->Fields(), new_schema_->Fields(),
/*need_mapping=*/true,
+ pool_);
+
+ SimpleStats stats = CreateStats();
+ SimpleStats expected_stats = CreateNewStats();
+
+ ASSERT_OK_AND_ASSIGN(SimpleStatsEvolution::EvolutionStats result_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+ /*dense_fields=*/std::nullopt));
+
+ CheckResultRow(*(result_stats.min_values), expected_stats.min_values_,
new_schema_->Fields());
+ CheckResultRow(*(result_stats.max_values), expected_stats.max_values_,
new_schema_->Fields());
+ CheckResultArray(*(result_stats.null_counts), {1l, 0l, 0l, 1l, 0l, 0l, 0l,
4l});  // Last slot (f6, id 8, absent from the old schema) equals row_count.
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestSchemaChangeWithDenseFields) {  // Dense stats for old fields f3, f4, f5 evolved to the new schema.
+ SimpleStatsEvolution evo(old_schema_->Fields(), new_schema_->Fields(),
/*need_mapping=*/true,
+ pool_);
+
+ auto min = BinaryRowGenerator::GenerateRow({std::string("no"), 10, 1234l},
pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow({std::string("yes"), 20,
2234l}, pool_.get());
+ auto null_count = BinaryArray::FromLongArray({0, 1, 1}, pool_.get());
+ SimpleStats old_stats(min, max, null_count);  // One slot per dense field, in dense_fields order.
+
+ ASSERT_OK_AND_ASSIGN(
+ SimpleStatsEvolution::EvolutionStats result_stats,
+ evo.Evolution(old_stats, /*row_count=*/4,
+ /*dense_fields=*/std::vector<std::string>({"f3", "f4",
"f5"})));
+
+ auto expected_min = BinaryRowGenerator::GenerateRow(
+ {10, NullType(), NullType(), NullType(), NullType(), NullType(),
false, NullType()},
+ pool_.get());
+ auto expected_max = BinaryRowGenerator::GenerateRow(
+ {20, NullType(), NullType(), NullType(), NullType(), NullType(), true,
NullType()},
+ pool_.get());
+
+ CheckResultRow(*(result_stats.min_values), expected_min,
new_schema_->Fields());
+ CheckResultRow(*(result_stats.max_values), expected_max,
new_schema_->Fields());
+ CheckResultArray(*(result_stats.null_counts),
+ {1l, NullType(), NullType(), NullType(), NullType(),
NullType(), 0l, 4l});  // Last slot (f6, absent from the old schema) equals row_count.
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestNoSchemaChangeWithDenseFieldsWithMap) {  // Checks the dense-field mapping cache across three calls.
+ std::string table_root =
+ paimon::test::GetDataDir() +
+
"orc/append_table_alter_table_with_cast.db/append_table_alter_table_with_cast";
+ SchemaManager manager(fs_, table_root);
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema,
manager.ReadSchema(/*schema_id=*/0));
+ ASSERT_TRUE(schema);
+
+ SimpleStatsEvolution evo(schema->Fields(), schema->Fields(),
/*need_mapping=*/false, pool_);
+ {
+ // first turn
+ auto min = BinaryRowGenerator::GenerateRow(
+ {0, 1, TimestampType(Timestamp(0, 1), 9), NullType()},
pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow(
+ {20, 21, TimestampType(Timestamp(20, 21), 9), NullType()},
pool_.get());
+ auto null_count = BinaryArray::FromLongArray({0, 0, 1, 4},
pool_.get());
+ SimpleStats stats(min, max, null_count);
+
+ ASSERT_OK_AND_ASSIGN(
+ SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+ /*dense_fields=*/std::vector<std::string>({"key0",
"key1", "f1", "f5"})));
+
+ auto expected_min =
+ BinaryRowGenerator::GenerateRow({0, 1, NullType(),
TimestampType(Timestamp(0, 1), 9),
+ NullType(), NullType(),
NullType(), NullType()},
+ pool_.get());
+ auto expected_max = BinaryRowGenerator::GenerateRow(
+ {20, 21, NullType(), TimestampType(Timestamp(20, 21), 9),
NullType(), NullType(),
+ NullType(), NullType()},
+ pool_.get());
+
+ CheckResultRow(*(new_stats.min_values), expected_min,
schema->Fields());
+ CheckResultRow(*(new_stats.max_values), expected_max,
schema->Fields());
+ CheckResultArray(*(new_stats.null_counts),
+ {0l, 0l, NullType(), 1l, NullType(), NullType(),
NullType(), 4l});
+ ASSERT_EQ(evo.dense_fields_mapping_.Size(), 1);  // First dense_fields list was cached.
+ }
+ {
+ // second turn, evo with same dense_fields
+ auto min = BinaryRowGenerator::GenerateRow(
+ {0, 1, TimestampType(Timestamp(0, 1), 9), NullType()},
pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow(
+ {20, 21, TimestampType(Timestamp(20, 21), 9), NullType()},
pool_.get());
+ auto null_count = BinaryArray::FromLongArray({0, 0, 1, 4},
pool_.get());
+ SimpleStats stats(min, max, null_count);
+ ASSERT_OK_AND_ASSIGN(
+ SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+ /*dense_fields=*/std::vector<std::string>({"key0",
"key1", "f1", "f5"})));
+
+ auto expected_min =
+ BinaryRowGenerator::GenerateRow({0, 1, NullType(),
TimestampType(Timestamp(0, 1), 9),
+ NullType(), NullType(),
NullType(), NullType()},
+ pool_.get());
+ auto expected_max = BinaryRowGenerator::GenerateRow(
+ {20, 21, NullType(), TimestampType(Timestamp(20, 21), 9),
NullType(), NullType(),
+ NullType(), NullType()},
+ pool_.get());
+
+ CheckResultRow(*(new_stats.min_values), expected_min,
schema->Fields());
+ CheckResultRow(*(new_stats.max_values), expected_max,
schema->Fields());
+ CheckResultArray(*(new_stats.null_counts),
+ {0l, 0l, NullType(), 1l, NullType(), NullType(),
NullType(), 4l});
+ ASSERT_EQ(evo.dense_fields_mapping_.Size(), 1);  // Same list again: cache size must not grow.
+ }
+ {
+ // third turn, evo with different dense_fields
+ auto min = BinaryRowGenerator::GenerateRow({TimestampType(Timestamp(0,
1), 9), NullType()},
+ pool_.get());
+ auto max = BinaryRowGenerator::GenerateRow(
+ {TimestampType(Timestamp(20, 21), 9), NullType()}, pool_.get());
+ auto null_count = BinaryArray::FromLongArray({1, 4}, pool_.get());
+ SimpleStats stats(min, max, null_count);
+ ASSERT_OK_AND_ASSIGN(
+ SimpleStatsEvolution::EvolutionStats new_stats,
+ evo.Evolution(stats, /*row_count=*/4,
+ /*dense_fields=*/std::vector<std::string>({"f1",
"f5"})));
+
+ auto expected_min = BinaryRowGenerator::GenerateRow(
+ {NullType(), NullType(), NullType(), TimestampType(Timestamp(0,
1), 9), NullType(),
+ NullType(), NullType(), NullType()},
+ pool_.get());
+ auto expected_max = BinaryRowGenerator::GenerateRow(
+ {NullType(), NullType(), NullType(), TimestampType(Timestamp(20,
21), 9), NullType(),
+ NullType(), NullType(), NullType()},
+ pool_.get());
+
+ CheckResultRow(*(new_stats.min_values), expected_min,
schema->Fields());
+ CheckResultRow(*(new_stats.max_values), expected_max,
schema->Fields());
+ CheckResultArray(*(new_stats.null_counts), {NullType(), NullType(),
NullType(), 1l,
+ NullType(), NullType(),
NullType(), 4l});
+ ASSERT_EQ(evo.dense_fields_mapping_.Size(), 2);  // A different list adds a second cache entry.
+ }
+}
+
+TEST_F(SimpleStatsEvolutionTest, TestGetFieldMap) {  // Verifies the two lookup maps built by the constructor.
+ std::string table_root =
+ paimon::test::GetDataDir() +
+
"orc/append_table_alter_table_with_cast.db/append_table_alter_table_with_cast";
+ SchemaManager manager(fs_, table_root);
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema,
manager.ReadSchema(/*schema_id=*/0));
+ ASSERT_TRUE(schema);
+
+ ASSERT_OK_AND_ASSIGN(std::shared_ptr<TableSchema> schema1,
manager.ReadSchema(/*schema_id=*/1));
+ ASSERT_TRUE(schema1);
+
+ SimpleStatsEvolution evo(schema->Fields(), schema1->Fields(),
/*need_mapping=*/true, pool_);  // schema 0 is the data schema, schema 1 the table schema.
+
+ std::map<int32_t, std::pair<int32_t, DataField>>
expected_id_to_data_fields;  // field id -> (index in schema 0, field of schema 0)
+ expected_id_to_data_fields[0] = std::make_pair(0,
schema->GetField("key0").value());
+ expected_id_to_data_fields[1] = std::make_pair(1,
schema->GetField("key1").value());
+ expected_id_to_data_fields[2] = std::make_pair(2,
schema->GetField("f0").value());
+ expected_id_to_data_fields[3] = std::make_pair(3,
schema->GetField("f1").value());
+ expected_id_to_data_fields[4] = std::make_pair(4,
schema->GetField("f2").value());
+ expected_id_to_data_fields[5] = std::make_pair(5,
schema->GetField("f3").value());
+ expected_id_to_data_fields[6] = std::make_pair(6,
schema->GetField("f4").value());
+ expected_id_to_data_fields[7] = std::make_pair(7,
schema->GetField("f5").value());
+
+ std::map<std::string, DataField> expected_name_to_table_fields;  // field name -> field of schema 1
+ expected_name_to_table_fields["f4"] = schema1->GetField("f4").value();
+ expected_name_to_table_fields["key0"] = schema1->GetField("key0").value();
+ expected_name_to_table_fields["key1"] = schema1->GetField("key1").value();
+ expected_name_to_table_fields["f3"] = schema1->GetField("f3").value();
+ expected_name_to_table_fields["f1"] = schema1->GetField("f1").value();
+ expected_name_to_table_fields["f2"] = schema1->GetField("f2").value();
+ expected_name_to_table_fields["f0"] = schema1->GetField("f0").value();
+ expected_name_to_table_fields["f6"] = schema1->GetField("f6").value();
+
+ ASSERT_EQ(expected_id_to_data_fields, evo.GetFieldIdToDataField());
+ ASSERT_EQ(expected_name_to_table_fields, evo.GetFieldNameToTableField());
+}
+
+} // namespace paimon::test
diff --git a/src/paimon/core/stats/simple_stats_evolutions.h
b/src/paimon/core/stats/simple_stats_evolutions.h
new file mode 100644
index 0000000..c72a488
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_evolutions.h
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <memory>
+
+#include "paimon/common/utils/concurrent_hash_map.h"
+#include "paimon/core/schema/table_schema.h"
+#include "paimon/core/stats/simple_stats_evolution.h"
+
+namespace paimon {
+class SimpleStatsEvolutions {  // Caches one SimpleStatsEvolution per data schema id for a single table schema.
+ public:  // The constructor below only stores the table schema and memory pool for later use by GetOrCreate().
+ SimpleStatsEvolutions(const std::shared_ptr<TableSchema>& table_schema,
+ const std::shared_ptr<MemoryPool>& pool)
+ : pool_(pool), table_schema_(table_schema) {}
+ // Returns the evolution cached under data_schema->Id(); on a miss, builds one, inserts it and returns it. data_schema is dereferenced without a null check.
+ std::shared_ptr<SimpleStatsEvolution> GetOrCreate(
+ const std::shared_ptr<TableSchema>& data_schema) {
+ auto data_schema_id = data_schema->Id();
+ auto cached_evolution = evolutions_.Find(data_schema_id);
+ if (cached_evolution != std::nullopt) {
+ return cached_evolution.value();
+ }
+ bool need_mapping = data_schema_id != table_schema_->Id();  // Mapping is requested only when the data schema id differs from the table schema id.
+ auto evolution = std::make_shared<SimpleStatsEvolution>(
+ data_schema->Fields(), table_schema_->Fields(), need_mapping,
pool_);
+ evolutions_.Insert(data_schema_id, evolution);  // NOTE(review): Find and Insert are separate calls, so concurrent misses for one id may each build an evolution - confirm this is acceptable.
+ return evolution;
+ }
+
+ private:
+ std::shared_ptr<MemoryPool> pool_;  // Passed to every SimpleStatsEvolution created by GetOrCreate().
+ std::shared_ptr<TableSchema> table_schema_;  // Supplies the table-side fields and the id compared against each data schema id.
+ // Data schema id -> evolution built for that schema.
+ ConcurrentHashMap<int64_t, std::shared_ptr<SimpleStatsEvolution>>
evolutions_;
+};
+} // namespace paimon
diff --git a/src/paimon/core/stats/simple_stats_test.cpp
b/src/paimon/core/stats/simple_stats_test.cpp
new file mode 100644
index 0000000..752534c
--- /dev/null
+++ b/src/paimon/core/stats/simple_stats_test.cpp
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "paimon/core/stats/simple_stats.h"
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paimon/testing/utils/binary_row_generator.h"
+#include "paimon/testing/utils/testharness.h"
+namespace paimon::test {
+TEST(SimpleStatsTest, TestToFromRow) {  // Converts each SimpleStats with ToRow(), parses it back with FromRow(), and expects an equal object and equal ToString().
+ auto pool = GetDefaultPool();
+ std::vector<SimpleStats> simple_stats_vec = {  // Three cases: a ten-column stats object, a single all-null column, and EmptyStats().
+ BinaryRowGenerator::GenerateStats(
+ {false, static_cast<int8_t>(-2), static_cast<int16_t>(-32768),
+ static_cast<int32_t>(-2147483648), NullType(),
static_cast<int64_t>(-4294967298),
+ static_cast<float>(0.5), 1.141592659, "20250326", NullType()},
+ {true, static_cast<int8_t>(1), static_cast<int16_t>(32767),
+ static_cast<int32_t>(2147483647), NullType(),
static_cast<int64_t>(4294967296),
+ static_cast<float>(2.0), 3.141592657, "20250327", NullType()},
+ std::vector<int64_t>({1, 0, 0, 1, 4, 1, 0, 0, 1, 0}), pool.get()),  // presumably min values, max values, then per-column null counts - confirm against GenerateStats.
+ BinaryRowGenerator::GenerateStats({NullType()}, {NullType()}, {1},
pool.get()),
+ SimpleStats::EmptyStats(),
+ };
+ for (const auto& stats : simple_stats_vec) {  // Every case goes through the same ToRow()/FromRow() round trip.
+ auto row = stats.ToRow();
+ ASSERT_OK_AND_ASSIGN(auto result_stats, SimpleStats::FromRow(&row,
pool.get()));
+ ASSERT_EQ(result_stats, stats);
+ ASSERT_EQ(result_stats.ToString(), stats.ToString());  // Also compares the string rendering of the parsed and original stats.
+ }
+}
+
+} // namespace paimon::test