This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 1c431b6 feat: add satisfies order for SortField/SortOrder and
Transform (#284)
1c431b6 is described below
commit 1c431b6ad24c7a4265add773c2f9ccd6e0175fac
Author: Junwang Zhao <[email protected]>
AuthorDate: Wed Nov 5 15:11:46 2025 +0800
feat: add satisfies order for SortField/SortOrder and Transform (#284)
This PR also makes the `ToString` consistent with Java implementation.
---
src/iceberg/sort_field.cc | 14 ++++-
src/iceberg/sort_field.h | 6 ++
src/iceberg/sort_order.cc | 33 +++++++++-
src/iceberg/sort_order.h | 14 +++++
src/iceberg/test/sort_field_test.cc | 29 ++++++---
src/iceberg/test/sort_order_test.cc | 78 ++++++++++++++++++++---
src/iceberg/test/transform_test.cc | 121 ++++++++++++++++++++++++++++++++++++
src/iceberg/transform.cc | 47 ++++++++++++++
src/iceberg/transform.h | 14 +++++
9 files changed, 336 insertions(+), 20 deletions(-)
diff --git a/src/iceberg/sort_field.cc b/src/iceberg/sort_field.cc
index e9b3bf7..29083ba 100644
--- a/src/iceberg/sort_field.cc
+++ b/src/iceberg/sort_field.cc
@@ -41,10 +41,18 @@ SortDirection SortField::direction() const { return
direction_; }
NullOrder SortField::null_order() const { return null_order_; }
+bool SortField::Satisfies(const SortField& other) const {
+ if (*this == other) {
+ return true;
+ } else if (source_id_ != other.source_id() || direction_ !=
other.direction() ||
+ null_order_ != other.null_order()) {
+ return false;
+ }
+ return transform_->SatisfiesOrderOf(*other.transform());
+}
+
std::string SortField::ToString() const {
- return std::format(
- "sort_field(source_id={}, transform={}, direction={}, null_order={})",
source_id_,
- *transform_, direction_, null_order_);
+ return std::format("{}({}) {} {}", *transform_, source_id_, direction_,
null_order_);
}
bool SortField::Equals(const SortField& other) const {
diff --git a/src/iceberg/sort_field.h b/src/iceberg/sort_field.h
index 459472c..223c573 100644
--- a/src/iceberg/sort_field.h
+++ b/src/iceberg/sort_field.h
@@ -107,9 +107,15 @@ class ICEBERG_EXPORT SortField : public util::Formattable {
/// \brief Get the null order.
NullOrder null_order() const;
+ /// \brief Checks whether this field's order satisfies another field's order.
+ bool Satisfies(const SortField& other) const;
+
std::string ToString() const override;
friend bool operator==(const SortField& lhs, const SortField& rhs) {
+ if (&lhs == &rhs) {
+ return true;
+ }
return lhs.Equals(rhs);
}
diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc
index 9db51c7..48a3066 100644
--- a/src/iceberg/sort_order.cc
+++ b/src/iceberg/sort_order.cc
@@ -20,6 +20,7 @@
#include "iceberg/sort_order.h"
#include <format>
+#include <ranges>
#include "iceberg/util/formatter.h" // IWYU pragma: keep
@@ -38,10 +39,38 @@ int32_t SortOrder::order_id() const { return order_id_; }
std::span<const SortField> SortOrder::fields() const { return fields_; }
+bool SortOrder::Satisfies(const SortOrder& other) const {
+ // any ordering satisfies an unsorted ordering
+ if (other.is_unsorted()) {
+ return true;
+ }
+
+ // this ordering cannot satisfy an ordering with more sort fields
+ if (fields_.size() < other.fields().size()) {
+ return false;
+ }
+
+ // this ordering has either more or the same number of sort fields
+ for (const auto& [field, other_field] : std::views::zip(fields_,
other.fields_)) {
+ if (!field.Satisfies(other_field)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool SortOrder::SameOrder(const SortOrder& other) const {
+ return fields_ == other.fields_;
+}
+
std::string SortOrder::ToString() const {
- std::string repr = std::format("sort_order[order_id<{}>,\n", order_id_);
+ std::string repr = "[";
for (const auto& field : fields_) {
- std::format_to(std::back_inserter(repr), " {}\n", field);
+ std::format_to(std::back_inserter(repr), "\n {}", field);
+ }
+ if (!fields_.empty()) {
+ repr.push_back('\n');
}
repr += "]";
return repr;
diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h
index 6e49153..e245a74 100644
--- a/src/iceberg/sort_order.h
+++ b/src/iceberg/sort_order.h
@@ -49,6 +49,20 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
/// \brief Get the list of sort fields.
std::span<const SortField> fields() const;
+ /// \brief Returns true if the sort order is sorted
+ bool is_sorted() const { return !fields_.empty(); }
+
+ /// \brief Returns true if the sort order is unsorted
+ /// A SortOrder is unsorted if it has no sort fields.
+ bool is_unsorted() const { return fields_.empty(); }
+
+ /// \brief Checks whether this order satisfies another order.
+ bool Satisfies(const SortOrder& other) const;
+
+ /// \brief Checks whether this order is equivalent to another order while
ignoring the
+ /// order id.
+ bool SameOrder(const SortOrder& other) const;
+
std::string ToString() const override;
friend bool operator==(const SortOrder& lhs, const SortOrder& rhs) {
diff --git a/src/iceberg/test/sort_field_test.cc
b/src/iceberg/test/sort_field_test.cc
index d8be799..751b8a0 100644
--- a/src/iceberg/test/sort_field_test.cc
+++ b/src/iceberg/test/sort_field_test.cc
@@ -36,14 +36,8 @@ TEST(SortFieldTest, Basics) {
EXPECT_EQ(*transform, *field.transform());
EXPECT_EQ(SortDirection::kAscending, field.direction());
EXPECT_EQ(NullOrder::kFirst, field.null_order());
- EXPECT_EQ(
- "sort_field(source_id=1, transform=identity, direction=asc, "
- "null_order=nulls-first)",
- field.ToString());
- EXPECT_EQ(
- "sort_field(source_id=1, transform=identity, direction=asc, "
- "null_order=nulls-first)",
- std::format("{}", field));
+ EXPECT_EQ(field.ToString(), "identity(1) asc nulls-first");
+ EXPECT_EQ(std::format("{}", field), "identity(1) asc nulls-first");
}
}
@@ -67,4 +61,23 @@ TEST(SortFieldTest, Equality) {
ASSERT_NE(field1, field5);
ASSERT_NE(field5, field1);
}
+
+TEST(SortFieldTest, Satisfies) {
+ const auto bucket_transform = Transform::Bucket(8);
+ const auto identity_transform = Transform::Identity();
+
+ SortField field1(1, bucket_transform, SortDirection::kAscending,
NullOrder::kFirst);
+ SortField field2(1, bucket_transform, SortDirection::kAscending,
NullOrder::kFirst);
+ SortField field3(1, identity_transform, SortDirection::kAscending,
NullOrder::kFirst);
+ SortField field4(1, bucket_transform, SortDirection::kDescending,
NullOrder::kFirst);
+ SortField field5(1, bucket_transform, SortDirection::kAscending,
NullOrder::kLast);
+ SortField field6(2, bucket_transform, SortDirection::kAscending,
NullOrder::kFirst);
+
+ EXPECT_TRUE(field1.Satisfies(field2)); // Same fields
+ EXPECT_FALSE(field1.Satisfies(field3)); // Different transform
+ EXPECT_FALSE(field1.Satisfies(field4)); // Different direction
+ EXPECT_FALSE(field1.Satisfies(field5)); // Different null order
+ EXPECT_FALSE(field1.Satisfies(field6)); // Different source_id
+}
+
} // namespace iceberg
diff --git a/src/iceberg/test/sort_order_test.cc
b/src/iceberg/test/sort_order_test.cc
index 590f30b..bb407af 100644
--- a/src/iceberg/test/sort_order_test.cc
+++ b/src/iceberg/test/sort_order_test.cc
@@ -48,13 +48,12 @@ TEST(SortOrderTest, Basics) {
ASSERT_EQ(st_field1, fields[0]);
ASSERT_EQ(st_field2, fields[1]);
auto sort_order_str =
- "sort_order[order_id<100>,\n"
- " sort_field(source_id=5, transform=identity, direction=asc, "
- "null_order=nulls-first)\n"
- " sort_field(source_id=7, transform=identity, direction=desc, "
- "null_order=nulls-first)\n]";
- EXPECT_EQ(sort_order_str, sort_order.ToString());
- EXPECT_EQ(sort_order_str, std::format("{}", sort_order));
+ "[\n"
+ " identity(5) asc nulls-first\n"
+ " identity(7) desc nulls-first\n"
+ "]";
+ EXPECT_EQ(sort_order.ToString(), sort_order_str);
+ EXPECT_EQ(std::format("{}", sort_order), sort_order_str);
}
}
@@ -84,4 +83,69 @@ TEST(SortOrderTest, Equality) {
ASSERT_NE(sort_order1, sort_order5);
ASSERT_NE(sort_order5, sort_order1);
}
+
+TEST(SortOrderTest, IsUnsorted) {
+ auto unsorted = SortOrder::Unsorted();
+ EXPECT_TRUE(unsorted->is_unsorted());
+ EXPECT_FALSE(unsorted->is_sorted());
+}
+
+TEST(SortOrderTest, IsSorted) {
+ SchemaField field1(5, "ts", iceberg::timestamp(), true);
+ auto identity_transform = Transform::Identity();
+ SortField st_field1(5, identity_transform, SortDirection::kAscending,
+ NullOrder::kFirst);
+ SortOrder sorted_order(100, {st_field1});
+
+ EXPECT_TRUE(sorted_order.is_sorted());
+ EXPECT_FALSE(sorted_order.is_unsorted());
+}
+
+TEST(SortOrderTest, Satisfies) {
+ SchemaField field1(5, "ts", iceberg::timestamp(), true);
+ SchemaField field2(7, "bar", iceberg::string(), true);
+ auto identity_transform = Transform::Identity();
+ auto bucket_transform = Transform::Bucket(8);
+
+ SortField st_field1(5, identity_transform, SortDirection::kAscending,
+ NullOrder::kFirst);
+ SortField st_field2(7, identity_transform, SortDirection::kDescending,
+ NullOrder::kFirst);
+ SortField st_field3(7, bucket_transform, SortDirection::kAscending,
NullOrder::kFirst);
+
+ SortOrder sort_order1(100, {st_field1, st_field2});
+ SortOrder sort_order2(101, {st_field1});
+ SortOrder sort_order3(102, {st_field1, st_field3});
+ SortOrder sort_order4(104, {st_field2});
+ auto unsorted = SortOrder::Unsorted();
+
+ // Any order satisfies an unsorted order, including unsorted itself
+ EXPECT_TRUE(unsorted->Satisfies(*unsorted));
+ EXPECT_TRUE(sort_order1.Satisfies(*unsorted));
+ EXPECT_TRUE(sort_order2.Satisfies(*unsorted));
+ EXPECT_TRUE(sort_order3.Satisfies(*unsorted));
+
+ // Unsorted does not satisfy any sorted order
+ EXPECT_FALSE(unsorted->Satisfies(sort_order1));
+ EXPECT_FALSE(unsorted->Satisfies(sort_order2));
+ EXPECT_FALSE(unsorted->Satisfies(sort_order3));
+
+ // A sort order satisfies itself
+ EXPECT_TRUE(sort_order1.Satisfies(sort_order1));
+ EXPECT_TRUE(sort_order2.Satisfies(sort_order2));
+ EXPECT_TRUE(sort_order3.Satisfies(sort_order3));
+
+ // A sort order with more fields satisfy one with fewer fields
+ EXPECT_TRUE(sort_order1.Satisfies(sort_order2));
+ EXPECT_TRUE(sort_order3.Satisfies(sort_order2));
+
+ // A sort order does not satisfy one with more fields
+ EXPECT_FALSE(sort_order2.Satisfies(sort_order1));
+ EXPECT_FALSE(sort_order2.Satisfies(sort_order3));
+
+ // A sort order does not satify one with different fields
+ EXPECT_FALSE(sort_order4.Satisfies(sort_order2));
+ EXPECT_FALSE(sort_order2.Satisfies(sort_order4));
+}
+
} // namespace iceberg
diff --git a/src/iceberg/test/transform_test.cc
b/src/iceberg/test/transform_test.cc
index 79d2564..e4352af 100644
--- a/src/iceberg/test/transform_test.cc
+++ b/src/iceberg/test/transform_test.cc
@@ -720,4 +720,125 @@ INSTANTIATE_TEST_SUITE_P(
.source =
Literal::Null(iceberg::string()),
.expected =
Literal::Null(iceberg::string())}));
+TEST(TransformPreservesOrderTest, PreservesOrder) {
+ struct Case {
+ std::string transform_str;
+ bool expected;
+ };
+
+ const std::vector<Case> cases = {
+ {.transform_str = "identity", .expected = true},
+ {.transform_str = "year", .expected = true},
+ {.transform_str = "month", .expected = true},
+ {.transform_str = "day", .expected = true},
+ {.transform_str = "hour", .expected = true},
+ {.transform_str = "void", .expected = false},
+ {.transform_str = "bucket[16]", .expected = false},
+ {.transform_str = "truncate[32]", .expected = true},
+ };
+
+ for (const auto& c : cases) {
+ auto transform = TransformFromString(c.transform_str);
+ ASSERT_TRUE(transform.has_value()) << "Failed to parse: " <<
c.transform_str;
+
+ EXPECT_EQ(transform.value()->PreservesOrder(), c.expected)
+ << "Unexpected result for transform: " << c.transform_str;
+ }
+}
+
+TEST(TransformSatisfiesOrderOfTest, SatisfiesOrderOf) {
+ struct Case {
+ std::string transform_str;
+ std::string other_transform_str;
+ bool expected;
+ };
+
+ const std::vector<Case> cases = {
+ // Identity satisfies all order-preserving transforms
+ {.transform_str = "identity", .other_transform_str = "identity",
.expected = true},
+ {.transform_str = "identity", .other_transform_str = "year", .expected =
true},
+ {.transform_str = "identity", .other_transform_str = "month", .expected
= true},
+ {.transform_str = "identity", .other_transform_str = "day", .expected =
true},
+ {.transform_str = "identity", .other_transform_str = "hour", .expected =
true},
+ {.transform_str = "identity",
+ .other_transform_str = "truncate[32]",
+ .expected = true},
+ {.transform_str = "identity",
+ .other_transform_str = "bucket[16]",
+ .expected = false},
+
+ // Truncate satisfies Truncate with smaller width
+ {.transform_str = "truncate[32]",
+ .other_transform_str = "truncate[16]",
+ .expected = true},
+ {.transform_str = "truncate[16]",
+ .other_transform_str = "truncate[16]",
+ .expected = true},
+ {.transform_str = "truncate[16]",
+ .other_transform_str = "truncate[32]",
+ .expected = false},
+ {.transform_str = "truncate[16]",
+ .other_transform_str = "bucket[32]",
+ .expected = false},
+
+ // Hour satisfies hour, day, month, and year
+ {.transform_str = "hour", .other_transform_str = "hour", .expected =
true},
+ {.transform_str = "hour", .other_transform_str = "day", .expected =
true},
+ {.transform_str = "hour", .other_transform_str = "month", .expected =
true},
+ {.transform_str = "hour", .other_transform_str = "year", .expected =
true},
+ {.transform_str = "hour", .other_transform_str = "identity", .expected =
false},
+ {.transform_str = "hour", .other_transform_str = "bucket[16]", .expected
= false},
+
+ // Day satisfies day, month, and year
+ {.transform_str = "day", .other_transform_str = "day", .expected = true},
+ {.transform_str = "day", .other_transform_str = "month", .expected =
true},
+ {.transform_str = "day", .other_transform_str = "year", .expected =
true},
+ {.transform_str = "day", .other_transform_str = "hour", .expected =
false},
+ {.transform_str = "day", .other_transform_str = "identity", .expected =
false},
+
+ // Month satisfies month and year
+ {.transform_str = "month", .other_transform_str = "month", .expected =
true},
+ {.transform_str = "month", .other_transform_str = "year", .expected =
true},
+ {.transform_str = "month", .other_transform_str = "day", .expected =
false},
+ {.transform_str = "month", .other_transform_str = "hour", .expected =
false},
+
+ // Year satisfies only year
+ {.transform_str = "year", .other_transform_str = "year", .expected =
true},
+ {.transform_str = "year", .other_transform_str = "month", .expected =
false},
+ {.transform_str = "year", .other_transform_str = "day", .expected =
false},
+ {.transform_str = "year", .other_transform_str = "hour", .expected =
false},
+
+ // Void satisfies no order-preserving transforms
+ {.transform_str = "void", .other_transform_str = "identity", .expected =
false},
+ {.transform_str = "void", .other_transform_str = "year", .expected =
false},
+ {.transform_str = "void", .other_transform_str = "month", .expected =
false},
+ {.transform_str = "void", .other_transform_str = "day", .expected =
false},
+ {.transform_str = "void", .other_transform_str = "hour", .expected =
false},
+
+ // Bucket satisfies only itself
+ {.transform_str = "bucket[16]",
+ .other_transform_str = "bucket[16]",
+ .expected = true},
+ {.transform_str = "bucket[16]",
+ .other_transform_str = "bucket[32]",
+ .expected = false},
+ {.transform_str = "bucket[16]",
+ .other_transform_str = "identity",
+ .expected = false},
+ };
+
+ for (const auto& c : cases) {
+ auto transform = TransformFromString(c.transform_str);
+ auto other_transform = TransformFromString(c.other_transform_str);
+
+ ASSERT_TRUE(transform.has_value()) << "Failed to parse: " <<
c.transform_str;
+ ASSERT_TRUE(other_transform.has_value())
+ << "Failed to parse: " << c.other_transform_str;
+
+ EXPECT_EQ(transform.value()->SatisfiesOrderOf(*other_transform.value()),
c.expected)
+ << "Unexpected result for transform: " << c.transform_str
+ << " and other transform: " << c.other_transform_str;
+ }
+}
+
} // namespace iceberg
diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc
index dcacf84..4724cc1 100644
--- a/src/iceberg/transform.cc
+++ b/src/iceberg/transform.cc
@@ -21,6 +21,7 @@
#include <format>
#include <regex>
+#include <utility>
#include "iceberg/transform_function.h"
#include "iceberg/type.h"
@@ -124,6 +125,52 @@ Result<std::shared_ptr<TransformFunction>> Transform::Bind(
}
}
+bool Transform::PreservesOrder() const {
+ switch (transform_type_) {
+ case TransformType::kUnknown:
+ case TransformType::kVoid:
+ case TransformType::kBucket:
+ return false;
+ case TransformType::kIdentity:
+ case TransformType::kTruncate:
+ case TransformType::kYear:
+ case TransformType::kMonth:
+ case TransformType::kDay:
+ case TransformType::kHour:
+ return true;
+ }
+ std::unreachable();
+}
+
+bool Transform::SatisfiesOrderOf(const Transform& other) const {
+ auto other_type = other.transform_type();
+ switch (transform_type_) {
+ case TransformType::kIdentity:
+ // ordering by value is the same as long as the other preserves order
+ return other.PreservesOrder();
+ case TransformType::kTruncate: {
+ if (other_type != TransformType::kTruncate) {
+ return false;
+ }
+ return std::get<int32_t>(param_) >= std::get<int32_t>(other.param_);
+ }
+ case TransformType::kHour:
+ return other_type == TransformType::kHour || other_type ==
TransformType::kDay ||
+ other_type == TransformType::kMonth || other_type ==
TransformType::kYear;
+ case TransformType::kDay:
+ return other_type == TransformType::kDay || other_type ==
TransformType::kMonth ||
+ other_type == TransformType::kYear;
+ case TransformType::kMonth:
+ return other_type == TransformType::kMonth || other_type ==
TransformType::kYear;
+ case TransformType::kYear:
+ case TransformType::kBucket:
+ case TransformType::kUnknown:
+ case TransformType::kVoid:
+ return *this == other;
+ }
+ std::unreachable();
+}
+
bool TransformFunction::Equals(const TransformFunction& other) const {
return transform_type_ == other.transform_type_ && *source_type_ ==
*other.source_type_;
}
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
index d06e31f..21483e0 100644
--- a/src/iceberg/transform.h
+++ b/src/iceberg/transform.h
@@ -150,6 +150,20 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
Result<std::shared_ptr<TransformFunction>> Bind(
const std::shared_ptr<Type>& source_type) const;
+ /// \brief Whether the transform preserves the order of values (is
monotonic).
+ bool PreservesOrder() const;
+
+ /// \brief Whether ordering by this transform's result satisfies the
ordering of another
+ /// transform's result.
+ ///
+ /// For example, sorting by day(ts) will produce an ordering that is also by
month(ts)
+ /// or year(ts). However, sorting by day(ts) will not satisfy the order of
hour(ts) or
+ /// identity(ts).
+ ///
+ /// \return true if ordering by this transform is equivalent to ordering by
the other
+ /// transform.
+ bool SatisfiesOrderOf(const Transform& other) const;
+
/// \brief Returns a string representation of this transform (e.g.,
"bucket[16]").
std::string ToString() const override;