This is an automated email from the ASF dual-hosted git repository.

gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new e759807  feat: add SortOrder::Make and SortOrder::Validate (#300)
e759807 is described below

commit e759807cfd739068d1a90a0d1d26c421c0d5ec70
Author: Junwang Zhao <[email protected]>
AuthorDate: Mon Nov 10 10:08:14 2025 +0800

    feat: add SortOrder::Make and SortOrder::Validate (#300)
---
 src/iceberg/sort_order.cc             |  64 +++++++++++++++++++-
 src/iceberg/sort_order.h              |  25 ++++++++
 src/iceberg/test/schema_field_test.cc |   2 +-
 src/iceberg/test/schema_test.cc       |   1 -
 src/iceberg/test/sort_order_test.cc   | 109 ++++++++++++++++++++++++++++++++++
 src/iceberg/test/transform_test.cc    |  73 +++++++++++++++++++++++
 src/iceberg/transform.cc              |  69 +++++++++++++++++++++
 src/iceberg/transform.h               |   5 ++
 8 files changed, 345 insertions(+), 3 deletions(-)

diff --git a/src/iceberg/sort_order.cc b/src/iceberg/sort_order.cc
index 48a3066..9e42b15 100644
--- a/src/iceberg/sort_order.cc
+++ b/src/iceberg/sort_order.cc
@@ -20,9 +20,15 @@
 #include "iceberg/sort_order.h"
 
 #include <format>
+#include <optional>
 #include <ranges>
 
+#include "iceberg/result.h"
+#include "iceberg/schema.h"
+#include "iceberg/sort_field.h"
+#include "iceberg/transform.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
+#include "iceberg/util/macros.h"
 
 namespace iceberg {
 
@@ -31,7 +37,7 @@ SortOrder::SortOrder(int32_t order_id, std::vector<SortField> 
fields)
 
 const std::shared_ptr<SortOrder>& SortOrder::Unsorted() {
   static const std::shared_ptr<SortOrder> unsorted =
-      std::make_shared<SortOrder>(/*order_id=*/0, std::vector<SortField>{});
+      std::make_shared<SortOrder>(kUnsortedOrderId, std::vector<SortField>{});
   return unsorted;
 }
 
@@ -80,4 +86,60 @@ bool SortOrder::Equals(const SortOrder& other) const {
   return order_id_ == other.order_id_ && fields_ == other.fields_;
 }
 
+Status SortOrder::Validate(const Schema& schema) const {
+  for (const auto& field : fields_) {
+    auto schema_field = schema.FindFieldById(field.source_id());
+    if (!schema_field.has_value() || schema_field.value() == std::nullopt) {
+      return InvalidArgument("Cannot find source column for sort field: {}", 
field);
+    }
+
+    const auto& source_type = schema_field.value().value().get().type();
+
+    if (!field.transform()->CanTransform(*source_type)) {
+      return InvalidArgument("Invalid source type {} for transform {}",
+                             source_type->ToString(), 
field.transform()->ToString());
+    }
+  }
+  return {};
+}
+
+Result<std::unique_ptr<SortOrder>> SortOrder::Make(const Schema& schema, 
int32_t sort_id,
+                                                   std::vector<SortField> 
fields) {
+  if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
+    return InvalidArgument("{} is reserved for unsorted sort order", 
kUnsortedOrderId);
+  }
+
+  if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
+    return InvalidArgument("Sort order must have at least one sort field");
+  }
+
+  for (const auto& field : fields) {
+    ICEBERG_ASSIGN_OR_RAISE(auto schema_field, 
schema.FindFieldById(field.source_id()));
+    if (schema_field == std::nullopt) {
+      return InvalidArgument("Cannot find source column for sort field: {}", 
field);
+    }
+
+    const auto& source_type = schema_field.value().get().type();
+    if (field.transform()->CanTransform(*source_type) == false) {
+      return InvalidArgument("Invalid source type {} for transform {}",
+                             source_type->ToString(), 
field.transform()->ToString());
+    }
+  }
+
+  return std::make_unique<SortOrder>(sort_id, std::move(fields));
+}
+
+Result<std::unique_ptr<SortOrder>> SortOrder::Make(int32_t sort_id,
+                                                   std::vector<SortField> 
fields) {
+  if (!fields.empty() && sort_id == kUnsortedOrderId) [[unlikely]] {
+    return InvalidArgument("{} is reserved for unsorted sort order", 
kUnsortedOrderId);
+  }
+
+  if (fields.empty() && sort_id != kUnsortedOrderId) [[unlikely]] {
+    return InvalidArgument("Sort order must have at least one sort field");
+  }
+
+  return std::make_unique<SortOrder>(sort_id, std::move(fields));
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/sort_order.h b/src/iceberg/sort_order.h
index e245a74..4a0b11a 100644
--- a/src/iceberg/sort_order.h
+++ b/src/iceberg/sort_order.h
@@ -20,11 +20,13 @@
 #pragma once
 
 #include <cstdint>
+#include <memory>
 #include <span>
 #include <vector>
 
 #include "iceberg/iceberg_export.h"
 #include "iceberg/sort_field.h"
+#include "iceberg/type_fwd.h"
 #include "iceberg/util/formattable.h"
 
 namespace iceberg {
@@ -36,6 +38,7 @@ namespace iceberg {
 /// applied to the data.
 class ICEBERG_EXPORT SortOrder : public util::Formattable {
  public:
+  static constexpr int32_t kUnsortedOrderId = 0;
   static constexpr int32_t kInitialSortOrderId = 1;
 
   SortOrder(int32_t order_id, std::vector<SortField> fields);
@@ -69,6 +72,28 @@ class ICEBERG_EXPORT SortOrder : public util::Formattable {
     return lhs.Equals(rhs);
   }
 
+  /// \brief Validates the sort order against a schema.
+  /// \param schema The schema to validate against.
+  /// \return Error status if the sort order has any invalid transform.
+  Status Validate(const Schema& schema) const;
+
+  /// \brief Create a SortOrder.
+  /// \param schema The schema to bind the sort order to.
+  /// \param sort_id The sort order id.
+  /// \param fields The sort fields.
+  /// \return A Result containing the SortOrder or an error.
+  static Result<std::unique_ptr<SortOrder>> Make(const Schema& schema, int32_t 
sort_id,
+                                                 std::vector<SortField> 
fields);
+
+  /// \brief Create a SortOrder without binding to a schema.
+  /// \param sort_id The sort order id.
+  /// \param fields The sort fields.
+  /// \return A Result containing the SortOrder or an error.
+  /// \note This method does not check whether the sort fields are valid for 
any schema.
+  /// Use IsBoundToSchema to check if the sort order is valid for a given 
schema.
+  static Result<std::unique_ptr<SortOrder>> Make(int32_t sort_id,
+                                                 std::vector<SortField> 
fields);
+
  private:
   /// \brief Compare two sort orders for equality.
   bool Equals(const SortOrder& other) const;
diff --git a/src/iceberg/test/schema_field_test.cc 
b/src/iceberg/test/schema_field_test.cc
index bc0dbbd..1078aad 100644
--- a/src/iceberg/test/schema_field_test.cc
+++ b/src/iceberg/test/schema_field_test.cc
@@ -63,7 +63,7 @@ TEST(SchemaFieldTest, Equality) {
   iceberg::SchemaField field1(1, "foo", iceberg::int32(), false);
   iceberg::SchemaField field2(2, "foo", iceberg::int32(), false);
   iceberg::SchemaField field3(1, "bar", iceberg::int32(), false);
-  iceberg::SchemaField field4(1, "foo", std::make_shared<iceberg::LongType>(), 
false);
+  iceberg::SchemaField field4(1, "foo", iceberg::int64(), false);
   iceberg::SchemaField field5(1, "foo", iceberg::int32(), true);
   iceberg::SchemaField field6(1, "foo", iceberg::int32(), false);
 
diff --git a/src/iceberg/test/schema_test.cc b/src/iceberg/test/schema_test.cc
index d99e9b3..89a8d54 100644
--- a/src/iceberg/test/schema_test.cc
+++ b/src/iceberg/test/schema_test.cc
@@ -25,7 +25,6 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
-#include "gtest/gtest.h"
 #include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/test/matchers.h"
diff --git a/src/iceberg/test/sort_order_test.cc 
b/src/iceberg/test/sort_order_test.cc
index bb407af..86bd525 100644
--- a/src/iceberg/test/sort_order_test.cc
+++ b/src/iceberg/test/sort_order_test.cc
@@ -26,11 +26,40 @@
 
 #include "iceberg/schema.h"
 #include "iceberg/sort_field.h"
+#include "iceberg/test/matchers.h"
 #include "iceberg/transform.h"
 #include "iceberg/util/formatter.h"  // IWYU pragma: keep
 
 namespace iceberg {
 
+class SortOrderMakeTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    field1_ = std::make_unique<SchemaField>(1, "x", int32(), true);
+    field2_ = std::make_unique<SchemaField>(2, "y", string(), true);
+    field3_ = std::make_unique<SchemaField>(3, "time", timestamp(), true);
+
+    schema_ = std::make_unique<Schema>(
+        std::vector<SchemaField>{*field1_, *field2_, *field3_}, 1);
+
+    sort_field1_ = std::make_unique<SortField>(
+        1, Transform::Identity(), SortDirection::kAscending, 
NullOrder::kFirst);
+    sort_field2_ = std::make_unique<SortField>(
+        2, Transform::Bucket(10), SortDirection::kDescending, 
NullOrder::kLast);
+    sort_field3_ = std::make_unique<SortField>(
+        3, Transform::Day(), SortDirection::kAscending, NullOrder::kFirst);
+  }
+
+  std::unique_ptr<Schema> schema_;
+  std::unique_ptr<SchemaField> field1_;
+  std::unique_ptr<SchemaField> field2_;
+  std::unique_ptr<SchemaField> field3_;
+
+  std::unique_ptr<SortField> sort_field1_;
+  std::unique_ptr<SortField> sort_field2_;
+  std::unique_ptr<SortField> sort_field3_;
+};
+
 TEST(SortOrderTest, Basics) {
   {
     SchemaField field1(5, "ts", iceberg::timestamp(), true);
@@ -148,4 +177,84 @@ TEST(SortOrderTest, Satisfies) {
   EXPECT_FALSE(sort_order2.Satisfies(sort_order4));
 }
 
+TEST_F(SortOrderMakeTest, MakeValidSortOrder) {
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto sort_order,
+      SortOrder::Make(*schema_, 1, std::vector<SortField>{*sort_field1_, 
*sort_field2_}));
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_sorted());
+  ASSERT_EQ(sort_order->fields().size(), 2);
+  EXPECT_EQ(sort_order->fields()[0], *sort_field1_);
+  EXPECT_EQ(sort_order->fields()[1], *sort_field2_);
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderEmptyFields) {
+  auto sort_order = SortOrder::Make(*schema_, 1, std::vector<SortField>{});
+  EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(sort_order,
+              HasErrorMessage("Sort order must have at least one sort field"));
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderUnsortedId) {
+  auto sort_order = SortOrder::Make(*schema_, SortOrder::kUnsortedOrderId,
+                                    std::vector<SortField>{*sort_field1_});
+  EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(sort_order,
+              HasErrorMessage(std::format("{} is reserved for unsorted sort 
order",
+                                          SortOrder::kUnsortedOrderId)));
+}
+
+TEST_F(SortOrderMakeTest, MakeValidUnsortedSortOrder) {
+  ICEBERG_UNWRAP_OR_FAIL(auto sort_order, 
SortOrder::Make(SortOrder::kUnsortedOrderId,
+                                                          
std::vector<SortField>{}));
+  ASSERT_NE(sort_order, nullptr);
+
+  EXPECT_TRUE(sort_order->is_unsorted());
+  EXPECT_EQ(sort_order->fields().size(), 0);
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderNonPrimitiveField) {
+  auto struct_field = std::make_unique<SchemaField>(
+      4, "struct_field",
+      std::make_shared<StructType>(std::vector<SchemaField>{
+          SchemaField::MakeRequired(41, "inner_field", iceberg::int32()),
+      }),
+      true);
+
+  Schema schema_with_struct(
+      std::vector<SchemaField>{*field1_, *field2_, *field3_, *struct_field}, 
1);
+
+  SortField sort_field_invalid(4, Transform::Identity(), 
SortDirection::kAscending,
+                               NullOrder::kFirst);
+
+  auto sort_order = SortOrder::Make(
+      schema_with_struct, 1, std::vector<SortField>{*sort_field1_, 
sort_field_invalid});
+  EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(sort_order, HasErrorMessage("Invalid source type"));
+}
+
+TEST_F(SortOrderMakeTest, MakeInvalidSortOrderFieldNotInSchema) {
+  SortField sort_field_invalid(999, Transform::Identity(), 
SortDirection::kAscending,
+                               NullOrder::kFirst);
+
+  auto sort_order = SortOrder::Make(
+      *schema_, 1, std::vector<SortField>{*sort_field1_, sort_field_invalid});
+  EXPECT_THAT(sort_order, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(sort_order, HasErrorMessage("Cannot find source column for sort 
field"));
+}
+
+TEST_F(SortOrderMakeTest, MakeUnboundSortOrder) {
+  SortField sort_field_invalid(999, Transform::Identity(), 
SortDirection::kAscending,
+                               NullOrder::kFirst);
+
+  auto sort_order =
+      SortOrder::Make(1, std::vector<SortField>{*sort_field1_, 
sort_field_invalid});
+  ASSERT_THAT(sort_order, IsOk());
+  auto validate_status = sort_order.value()->Validate(*schema_);
+  EXPECT_THAT(validate_status, IsError(ErrorKind::kInvalidArgument));
+  EXPECT_THAT(validate_status,
+              HasErrorMessage("Cannot find source column for sort field"));
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/test/transform_test.cc 
b/src/iceberg/test/transform_test.cc
index e4352af..529297b 100644
--- a/src/iceberg/test/transform_test.cc
+++ b/src/iceberg/test/transform_test.cc
@@ -27,6 +27,7 @@
 #include <gtest/gtest.h>
 
 #include "iceberg/expression/literal.h"
+#include "iceberg/schema_field.h"
 #include "iceberg/test/matchers.h"
 #include "iceberg/test/temporal_test_helper.h"
 #include "iceberg/type.h"
@@ -841,4 +842,76 @@ TEST(TransformSatisfiesOrderOfTest, SatisfiesOrderOf) {
   }
 }
 
+TEST(TransformCanTransformTest, CanTransform) {
+  struct Case {
+    std::string transform_str;
+    std::shared_ptr<Type> source_type;
+    bool expected;
+  };
+
+  const std::vector<Case> cases = {
+      // Identity can transform all primitive types
+      {.transform_str = "identity", .source_type = int32(), .expected = true},
+      {.transform_str = "identity", .source_type = string(), .expected = true},
+      {.transform_str = "identity", .source_type = boolean(), .expected = 
true},
+      {.transform_str = "identity",
+       .source_type = list(SchemaField(123, "element", int32(), false)),
+       .expected = false},
+
+      // Void can transform any type
+      {.transform_str = "void", .source_type = iceberg::int32(), .expected = 
true},
+      {.transform_str = "void",
+       .source_type = iceberg::map(SchemaField(123, "key", iceberg::string(), 
false),
+                                   SchemaField(124, "value", iceberg::int32(), 
true)),
+       .expected = true},
+
+      // Bucket can transform specific types
+      {.transform_str = "bucket[16]", .source_type = iceberg::int32(), 
.expected = true},
+      {.transform_str = "bucket[16]", .source_type = iceberg::string(), 
.expected = true},
+      {.transform_str = "bucket[16]",
+       .source_type = iceberg::float32(),
+       .expected = false},
+
+      // Truncate can transform specific types
+      {.transform_str = "truncate[32]",
+       .source_type = iceberg::int32(),
+       .expected = true},
+      {.transform_str = "truncate[32]",
+       .source_type = iceberg::string(),
+       .expected = true},
+      {.transform_str = "truncate[32]",
+       .source_type = iceberg::boolean(),
+       .expected = false},
+
+      // Year can transform date and timestamp types
+      {.transform_str = "year", .source_type = iceberg::date(), .expected = 
true},
+      {.transform_str = "year", .source_type = iceberg::timestamp(), .expected 
= true},
+      {.transform_str = "year", .source_type = iceberg::string(), .expected = 
false},
+
+      // Month can transform date and timestamp types
+      {.transform_str = "month", .source_type = iceberg::date(), .expected = 
true},
+      {.transform_str = "month", .source_type = iceberg::timestamp(), 
.expected = true},
+      {.transform_str = "month", .source_type = iceberg::binary(), .expected = 
false},
+
+      // Day can transform date and timestamp types
+      {.transform_str = "day", .source_type = iceberg::date(), .expected = 
true},
+      {.transform_str = "day", .source_type = iceberg::timestamp(), .expected 
= true},
+      {.transform_str = "day", .source_type = iceberg::uuid(), .expected = 
false},
+
+      // Hour can transform timestamp types
+      {.transform_str = "hour", .source_type = iceberg::timestamp(), .expected 
= true},
+      {.transform_str = "hour", .source_type = iceberg::timestamp_tz(), 
.expected = true},
+      {.transform_str = "hour", .source_type = iceberg::int32(), .expected = 
false},
+  };
+
+  for (const auto& c : cases) {
+    auto transform = TransformFromString(c.transform_str);
+    ASSERT_TRUE(transform.has_value()) << "Failed to parse: " << 
c.transform_str;
+
+    EXPECT_EQ(transform.value()->CanTransform(*c.source_type), c.expected)
+        << "Unexpected result for transform: " << c.transform_str
+        << " and source type: " << c.source_type->ToString();
+  }
+}
+
 }  // namespace iceberg
diff --git a/src/iceberg/transform.cc b/src/iceberg/transform.cc
index 4724cc1..2e39cb0 100644
--- a/src/iceberg/transform.cc
+++ b/src/iceberg/transform.cc
@@ -125,6 +125,75 @@ Result<std::shared_ptr<TransformFunction>> Transform::Bind(
   }
 }
 
+bool Transform::CanTransform(const Type& source_type) const {
+  switch (transform_type_) {
+    case TransformType::kIdentity:
+      if (!source_type.is_primitive()) [[unlikely]] {
+        return false;
+      }
+      return true;
+    case TransformType::kVoid:
+    case TransformType::kUnknown:
+      return true;
+    case TransformType::kBucket:
+      switch (source_type.type_id()) {
+        case TypeId::kInt:
+        case TypeId::kLong:
+        case TypeId::kDecimal:
+        case TypeId::kDate:
+        case TypeId::kTime:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+        case TypeId::kString:
+        case TypeId::kUuid:
+        case TypeId::kFixed:
+        case TypeId::kBinary:
+          return true;
+        default:
+          return false;
+      }
+    case TransformType::kTruncate:
+      switch (source_type.type_id()) {
+        case TypeId::kInt:
+        case TypeId::kLong:
+        case TypeId::kString:
+        case TypeId::kBinary:
+        case TypeId::kDecimal:
+          return true;
+        default:
+          return false;
+      }
+    case TransformType::kYear:
+    case TransformType::kMonth:
+      switch (source_type.type_id()) {
+        case TypeId::kDate:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return true;
+        default:
+          return false;
+      }
+    case TransformType::kDay:
+      switch (source_type.type_id()) {
+        case TypeId::kDate:
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return true;
+        default:
+          return false;
+      }
+    case TransformType::kHour:
+      switch (source_type.type_id()) {
+        case TypeId::kTimestamp:
+        case TypeId::kTimestampTz:
+          return true;
+        default:
+          return false;
+      }
+  }
+  std::unreachable();
+}
+
 bool Transform::PreservesOrder() const {
   switch (transform_type_) {
     case TransformType::kUnknown:
diff --git a/src/iceberg/transform.h b/src/iceberg/transform.h
index 21483e0..4f608c4 100644
--- a/src/iceberg/transform.h
+++ b/src/iceberg/transform.h
@@ -150,6 +150,11 @@ class ICEBERG_EXPORT Transform : public util::Formattable {
   Result<std::shared_ptr<TransformFunction>> Bind(
       const std::shared_ptr<Type>& source_type) const;
 
+  /// \brief Checks whether this function can be applied to the given Type.
+  /// \param source_type The source type to check.
+  /// \return true if this transform can be applied to the type, false 
otherwise
+  bool CanTransform(const Type& source_type) const;
+
   /// \brief Whether the transform preserves the order of values (is 
monotonic).
   bool PreservesOrder() const;
 

Reply via email to