This is an automated email from the ASF dual-hosted git repository.
gangwu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push:
new 320a985 feat: add eval support to bound term (#320)
320a985 is described below
commit 320a985e6ede52ce05e513bbc198069333fea193
Author: Gang Wu <[email protected]>
AuthorDate: Wed Nov 19 16:11:24 2025 +0800
feat: add eval support to bound term (#320)
- add struct like accessor
- support schema to find accessor by field id
- bound term can evaluate struct-like
---
src/iceberg/CMakeLists.txt | 1 +
src/iceberg/expression/term.cc | 30 +++--
src/iceberg/expression/term.h | 6 +-
src/iceberg/meson.build | 1 +
src/iceberg/row/struct_like.cc | 129 ++++++++++++++++++
src/iceberg/row/struct_like.h | 29 ++++-
src/iceberg/schema.cc | 66 ++++++++++
src/iceberg/schema.h | 10 ++
src/iceberg/test/CMakeLists.txt | 6 +
src/iceberg/test/eval_expr_test.cc | 245 +++++++++++++++++++++++++++++++++++
src/iceberg/test/struct_like_test.cc | 51 ++++++++
src/iceberg/type_fwd.h | 3 +-
12 files changed, 564 insertions(+), 13 deletions(-)
diff --git a/src/iceberg/CMakeLists.txt b/src/iceberg/CMakeLists.txt
index c048d29..22c2221 100644
--- a/src/iceberg/CMakeLists.txt
+++ b/src/iceberg/CMakeLists.txt
@@ -43,6 +43,7 @@ set(ICEBERG_SOURCES
partition_spec.cc
row/arrow_array_wrapper.cc
row/manifest_wrapper.cc
+ row/struct_like.cc
schema.cc
schema_field.cc
schema_internal.cc
diff --git a/src/iceberg/expression/term.cc b/src/iceberg/expression/term.cc
index ba6e55e..34dfb91 100644
--- a/src/iceberg/expression/term.cc
+++ b/src/iceberg/expression/term.cc
@@ -21,8 +21,8 @@
#include <format>
-#include "iceberg/exception.h"
#include "iceberg/result.h"
+#include "iceberg/row/struct_like.h"
#include "iceberg/schema.h"
#include "iceberg/transform.h"
#include "iceberg/util/checked_cast.h"
@@ -64,7 +64,11 @@ Result<std::shared_ptr<BoundReference>>
NamedReference::Bind(const Schema& schem
return InvalidExpression("Cannot find field '{}' in struct: {}",
field_name_,
schema.ToString());
}
- return BoundReference::Make(field_opt.value().get());
+
+ int32_t field_id = field_opt.value().get().field_id();
+ ICEBERG_ASSIGN_OR_RAISE(auto accessor, schema.GetAccessorById(field_id));
+
+ return BoundReference::Make(field_opt.value().get(), std::move(accessor));
}
std::string NamedReference::ToString() const {
@@ -72,17 +76,25 @@ std::string NamedReference::ToString() const {
}
// BoundReference implementation
-Result<std::unique_ptr<BoundReference>> BoundReference::Make(SchemaField
field) {
+Result<std::unique_ptr<BoundReference>> BoundReference::Make(
+ SchemaField field, std::unique_ptr<StructLikeAccessor> accessor) {
if (auto status = field.Validate(); !status.has_value()) [[unlikely]] {
return InvalidExpression("Cannot create BoundReference with invalid field:
{}",
status.error().message);
}
- return std::unique_ptr<BoundReference>(new BoundReference(std::move(field)));
+ if (!accessor) [[unlikely]] {
+ return InvalidExpression("Cannot create BoundReference without accessor");
+ }
+ return std::unique_ptr<BoundReference>(
+ new BoundReference(std::move(field), std::move(accessor)));
}
-BoundReference::BoundReference(SchemaField field) : field_(std::move(field)) {
+BoundReference::BoundReference(SchemaField field,
+ std::unique_ptr<StructLikeAccessor> accessor)
+ : field_(std::move(field)), accessor_(std::move(accessor)) {
ICEBERG_DCHECK(field_.Validate().has_value(),
"Cannot create BoundReference with invalid field");
+ ICEBERG_DCHECK(accessor_ != nullptr, "Cannot create BoundReference without
accessor");
}
BoundReference::~BoundReference() = default;
@@ -92,7 +104,7 @@ std::string BoundReference::ToString() const {
}
Result<Literal> BoundReference::Evaluate(const StructLike& data) const {
- return NotImplemented("BoundReference::Evaluate(StructLike) not
implemented");
+ return accessor_->GetLiteral(data);
}
bool BoundReference::Equals(const BoundTerm& other) const {
@@ -167,14 +179,14 @@ std::string BoundTransform::ToString() const {
}
Result<Literal> BoundTransform::Evaluate(const StructLike& data) const {
- throw IcebergError("BoundTransform::Evaluate(StructLike) not implemented");
+ ICEBERG_ASSIGN_OR_RAISE(auto literal, ref_->Evaluate(data));
+ return transform_func_->Transform(literal);
}
bool BoundTransform::MayProduceNull() const {
// transforms must produce null for null input values
// transforms may produce null for non-null inputs when not order-preserving
- // FIXME: add Transform::is_order_preserving()
- return ref_->MayProduceNull(); // || !transform_->is_order_preserving();
+ return ref_->MayProduceNull() || !transform_->PreservesOrder();
}
std::shared_ptr<Type> BoundTransform::type() const {
diff --git a/src/iceberg/expression/term.h b/src/iceberg/expression/term.h
index 6259b82..e2a378f 100644
--- a/src/iceberg/expression/term.h
+++ b/src/iceberg/expression/term.h
@@ -163,7 +163,8 @@ class ICEBERG_EXPORT BoundReference
/// \brief Create a bound reference.
///
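 /// \param field The schema field
+ /// \param accessor The accessor used to read the field value from a StructLike; must not be null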
- static Result<std::unique_ptr<BoundReference>> Make(SchemaField field);
+ static Result<std::unique_ptr<BoundReference>> Make(
+ SchemaField field, std::unique_ptr<StructLikeAccessor> accessor);
~BoundReference() override;
@@ -186,9 +187,10 @@ class ICEBERG_EXPORT BoundReference
Kind kind() const override { return Kind::kReference; }
private:
- explicit BoundReference(SchemaField field);
+ BoundReference(SchemaField field, std::unique_ptr<StructLikeAccessor>
accessor);
SchemaField field_;
+ std::unique_ptr<StructLikeAccessor> accessor_;
};
/// \brief An unbound transform expression.
diff --git a/src/iceberg/meson.build b/src/iceberg/meson.build
index 3a4f888..ae5f8ba 100644
--- a/src/iceberg/meson.build
+++ b/src/iceberg/meson.build
@@ -65,6 +65,7 @@ iceberg_sources = files(
'partition_spec.cc',
'row/arrow_array_wrapper.cc',
'row/manifest_wrapper.cc',
+ 'row/struct_like.cc',
'schema.cc',
'schema_field.cc',
'schema_internal.cc',
diff --git a/src/iceberg/row/struct_like.cc b/src/iceberg/row/struct_like.cc
new file mode 100644
index 0000000..b0fb67f
--- /dev/null
+++ b/src/iceberg/row/struct_like.cc
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "iceberg/row/struct_like.h"
+
+#include <utility>
+
+#include "iceberg/result.h"
+#include "iceberg/util/checked_cast.h"
+#include "iceberg/util/formatter_internal.h"
+#include "iceberg/util/macros.h"
+
+namespace iceberg {
+
+/// \brief Build an accessor that follows `position_path` through nested structs.
+///
+/// Each entry of `position_path` is a field position inside the struct reached so
+/// far; the last entry selects the value that is returned. Paths of length one and
+/// two get dedicated closures, longer paths use a generic loop, and an empty path
+/// produces an accessor that always returns an error.
+///
+/// \param type The type of the value this accessor is bound to.
+/// \param position_path Field positions, outermost struct first. The positions are
+/// copied into the accessor, so the caller's storage does not have to outlive it.
+StructLikeAccessor::StructLikeAccessor(std::shared_ptr<Type> type,
+                                       std::span<const size_t> position_path)
+    : type_(std::move(type)) {
+  if (position_path.size() == 1) {
+    accessor_ = [pos = position_path[0]](const StructLike& struct_like) -> Result<Scalar> {
+      return struct_like.GetField(pos);
+    };
+  } else if (position_path.size() == 2) {
+    accessor_ = [pos0 = position_path[0], pos1 = position_path[1]](
+                    const StructLike& struct_like) -> Result<Scalar> {
+      ICEBERG_ASSIGN_OR_RAISE(auto first_level_field, struct_like.GetField(pos0));
+      if (!std::holds_alternative<std::shared_ptr<StructLike>>(first_level_field)) {
+        return InvalidSchema("Encountered non-struct in the position path [{},{}]", pos0,
+                             pos1);
+      }
+      return std::get<std::shared_ptr<StructLike>>(first_level_field)->GetField(pos1);
+    };
+  } else if (!position_path.empty()) {
+    // std::span is a non-owning view: capturing it directly would leave the stored
+    // closure pointing at the caller's storage. Own a copy of the positions instead.
+    accessor_ = [path = std::vector<size_t>(position_path.begin(), position_path.end())](
+                    const StructLike& struct_like) -> Result<Scalar> {
+      // Holds every intermediate struct so the raw pointer below stays valid.
+      std::vector<std::shared_ptr<StructLike>> backups;
+      const StructLike* current_struct_like = &struct_like;
+      for (size_t i = 0; i < path.size() - 1; ++i) {
+        ICEBERG_ASSIGN_OR_RAISE(auto field, current_struct_like->GetField(path[i]));
+        if (!std::holds_alternative<std::shared_ptr<StructLike>>(field)) {
+          return InvalidSchema("Encountered non-struct in the position path [{}]", path);
+        }
+        backups.push_back(std::get<std::shared_ptr<StructLike>>(field));
+        current_struct_like = backups.back().get();
+      }
+      return current_struct_like->GetField(path.back());
+    };
+  } else {
+    accessor_ = [](const StructLike&) -> Result<Scalar> {
+      return Invalid("Cannot read StructLike with empty position path");
+    };
+  }
+}
+
+namespace {
+
+/// \brief Extract alternative `T` from `scalar`, or report a mismatch as an error.
+///
+/// Uses std::get_if so that a scalar holding an unexpected alternative becomes an
+/// error result instead of a thrown std::bad_variant_access.
+template <typename T>
+Result<T> ScalarAs(const Scalar& scalar, const Type& type) {
+  if (const T* value = std::get_if<T>(&scalar)) {
+    return *value;
+  }
+  return InvalidSchema("Scalar value does not match type {}", type.ToString());
+}
+
+}  // namespace
+
+/// \brief Read the bound value from `struct_like` and convert it to a Literal.
+///
+/// \param struct_like The row to read from.
+/// \return A null literal of the bound type when the scalar is std::monostate,
+/// otherwise the literal built from the scalar. Returns NotSupported when the bound
+/// type is not primitive or has no conversion here (UUID and any unlisted type),
+/// and an error when the scalar does not hold the alternative the type requires.
+Result<Literal> StructLikeAccessor::GetLiteral(const StructLike& struct_like) const {
+  if (!type_->is_primitive()) {
+    return NotSupported("Cannot get literal value for non-primitive type {}",
+                        type_->ToString());
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto scalar, Get(struct_like));
+
+  if (std::holds_alternative<std::monostate>(scalar)) {
+    return Literal::Null(internal::checked_pointer_cast<PrimitiveType>(type_));
+  }
+
+  switch (type_->type_id()) {
+    case TypeId::kBoolean: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<bool>(scalar, *type_));
+      return Literal::Boolean(value);
+    }
+    case TypeId::kInt: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int32_t>(scalar, *type_));
+      return Literal::Int(value);
+    }
+    case TypeId::kLong: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int64_t>(scalar, *type_));
+      return Literal::Long(value);
+    }
+    case TypeId::kFloat: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<float>(scalar, *type_));
+      return Literal::Float(value);
+    }
+    case TypeId::kDouble: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<double>(scalar, *type_));
+      return Literal::Double(value);
+    }
+    case TypeId::kString: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<std::string_view>(scalar, *type_));
+      return Literal::String(std::string(value));
+    }
+    case TypeId::kBinary: {
+      ICEBERG_ASSIGN_OR_RAISE(auto binary_data,
+                              ScalarAs<std::string_view>(scalar, *type_));
+      return Literal::Binary(
+          std::vector<uint8_t>(binary_data.cbegin(), binary_data.cend()));
+    }
+    case TypeId::kDecimal: {
+      const auto& decimal_type = internal::checked_cast<const DecimalType&>(*type_);
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<Decimal>(scalar, *type_));
+      return Literal::Decimal(value.value(), decimal_type.precision(),
+                              decimal_type.scale());
+    }
+    case TypeId::kDate: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int32_t>(scalar, *type_));
+      return Literal::Date(value);
+    }
+    case TypeId::kTime: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int64_t>(scalar, *type_));
+      return Literal::Time(value);
+    }
+    case TypeId::kTimestamp: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int64_t>(scalar, *type_));
+      return Literal::Timestamp(value);
+    }
+    case TypeId::kTimestampTz: {
+      ICEBERG_ASSIGN_OR_RAISE(auto value, ScalarAs<int64_t>(scalar, *type_));
+      return Literal::TimestampTz(value);
+    }
+    case TypeId::kFixed: {
+      ICEBERG_ASSIGN_OR_RAISE(auto fixed_data,
+                              ScalarAs<std::string_view>(scalar, *type_));
+      return Literal::Fixed(std::vector<uint8_t>(fixed_data.cbegin(), fixed_data.cend()));
+    }
+    case TypeId::kUuid:
+      // TODO(gangwu): Implement UUID type
+    default:
+      return NotSupported("Cannot convert scalar to literal of type {}",
+                          type_->ToString());
+  }
+
+  // Every switch path above returns.
+  std::unreachable();
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/row/struct_like.h b/src/iceberg/row/struct_like.h
index 3093f75..4999da6 100644
--- a/src/iceberg/row/struct_like.h
+++ b/src/iceberg/row/struct_like.h
@@ -26,11 +26,13 @@
/// ManifestEntry. Note that they do not carry type information and should be
/// used in conjunction with the schema to get the type information.
+#include <functional>
#include <memory>
+#include <span>
#include <string_view>
#include <variant>
-#include <vector>
+#include "iceberg/expression/literal.h"
#include "iceberg/result.h"
#include "iceberg/type_fwd.h"
#include "iceberg/util/decimal.h"
@@ -96,4 +98,29 @@ class ICEBERG_EXPORT MapLike {
virtual size_t size() const = 0;
};
+/// \brief An accessor that reads one value out of a struct-like object by following a fixed path of field positions through nested structs.
+class ICEBERG_EXPORT StructLikeAccessor {
+ public:
+ explicit StructLikeAccessor(std::shared_ptr<Type> type,  // type: type of the value this accessor yields
+ std::span<const size_t> position_path);  // position_path: field positions, outermost struct first
+
+ /// \brief Get the scalar value at the bound position path by invoking the stored accessor function on `struct_like`.
+ Result<Scalar> Get(const StructLike& struct_like) const {
+ return accessor_(struct_like);
+ }
+
+ /// \brief Get the literal value at the given position.
+ ///
+ /// \return The literal value at the given position, or an error if it is
+ /// not a primitive type.
+ Result<Literal> GetLiteral(const StructLike& struct_like) const;
+
+ /// \brief Get the type of the value that this accessor is bound to.
+ const Type& type() const { return *type_; }
+
+ private:
+ std::shared_ptr<Type> type_;  // Type of the bound value; exposed through type().
+ std::function<Result<Scalar>(const StructLike&)> accessor_;  // Lookup closure built by the constructor.
+};
+
} // namespace iceberg
diff --git a/src/iceberg/schema.cc b/src/iceberg/schema.cc
index bfb47b3..8719f22 100644
--- a/src/iceberg/schema.cc
+++ b/src/iceberg/schema.cc
@@ -22,9 +22,12 @@
#include <format>
#include <functional>
+#include "iceberg/result.h"
+#include "iceberg/row/struct_like.h"
#include "iceberg/schema_internal.h"
#include "iceberg/type.h"
#include "iceberg/util/formatter.h" // IWYU pragma: keep
+#include "iceberg/util/formatter_internal.h"
#include "iceberg/util/macros.h"
#include "iceberg/util/visit_type.h"
@@ -69,6 +72,48 @@ class NameToIdVisitor {
std::function<std::string(std::string_view)> quoting_func_;
};
+/// \brief Type visitor that records, for each primitive field reachable through
+/// nested structs, the chain of field positions that leads to it.
+class PositionPathVisitor {
+ public:
+  /// \brief Store the current path under the id of the field being visited.
+  ///
+  /// Fails when no field id has been assigned yet or when the id was already seen.
+  Status Visit(const PrimitiveType& type) {
+    if (current_field_id_ == kUnassignedFieldId) {
+      return InvalidSchema("Current field id is not assigned, type: {}", type.ToString());
+    }
+
+    const auto [entry, inserted] =
+        position_path_.try_emplace(current_field_id_, current_path_);
+    if (!inserted) {
+      return InvalidSchema("Duplicate field id found: {}, prev path: {}, curr path: {}",
+                           current_field_id_, entry->second, current_path_);
+    }
+
+    return {};
+  }
+
+  /// \brief Descend into every child field, extending the path by its position.
+  Status Visit(const StructType& type) {
+    size_t position = 0;
+    for (const auto& child : type.fields()) {
+      current_field_id_ = child.field_id();
+      current_path_.push_back(position++);
+      ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(*child.type(), this));
+      current_path_.pop_back();
+    }
+    return {};
+  }
+
+  // Non-struct types are not supported yet, but it is not an error.
+  Status Visit(const ListType& /*type*/) { return {}; }
+  Status Visit(const MapType& /*type*/) { return {}; }
+
+  /// \brief Hand over the collected field-id -> position-path mapping.
+  std::unordered_map<int32_t, std::vector<size_t>> Finish() {
+    return std::move(position_path_);
+  }
+
+ private:
+  static constexpr int32_t kUnassignedFieldId = -1;
+  int32_t current_field_id_ = kUnassignedFieldId;
+  std::vector<size_t> current_path_;
+  std::unordered_map<int32_t, std::vector<size_t>> position_path_;
+};
+
Schema::Schema(std::vector<SchemaField> fields, std::optional<int32_t>
schema_id)
: StructType(std::move(fields)), schema_id_(schema_id) {}
@@ -144,6 +189,27 @@ Result<std::optional<std::reference_wrapper<const
SchemaField>>> Schema::FindFie
return it->second;
}
+/// \brief Build the field-id -> position-path mapping by visiting the schema.
+Result<std::unordered_map<int32_t, std::vector<size_t>>> Schema::InitIdToPositionPath(
+    const Schema& self) {
+  PositionPathVisitor path_collector;
+  ICEBERG_RETURN_UNEXPECTED(VisitTypeInline(self, &path_collector));
+  return path_collector.Finish();
+}
+
+/// \brief Create an accessor for the field with the given id.
+///
+/// Returns NotFound when the id has no recorded position path or no matching field.
+Result<std::unique_ptr<StructLikeAccessor>> Schema::GetAccessorById(
+    int32_t field_id) const {
+  ICEBERG_ASSIGN_OR_RAISE(auto paths_ref, id_to_position_path_.Get(*this));
+  const auto& paths = paths_ref.get();
+
+  const auto path_it = paths.find(field_id);
+  if (path_it == paths.cend()) {
+    return NotFound("Cannot get accessor for field id: {}", field_id);
+  }
+
+  ICEBERG_ASSIGN_OR_RAISE(auto field, FindFieldById(field_id));
+  if (!field.has_value()) {
+    return NotFound("Cannot get accessor for field id: {}", field_id);
+  }
+
+  return std::make_unique<StructLikeAccessor>(field.value().get().type(),
+                                              path_it->second);
+}
+
IdToFieldVisitor::IdToFieldVisitor(
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>&
id_to_field)
: id_to_field_(id_to_field) {}
diff --git a/src/iceberg/schema.h b/src/iceberg/schema.h
index 32914be..94a8764 100644
--- a/src/iceberg/schema.h
+++ b/src/iceberg/schema.h
@@ -75,6 +75,12 @@ class ICEBERG_EXPORT Schema : public StructType {
Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
int32_t field_id) const;
+ /// \brief Get the accessor to access the field by field id.
+ ///
+ /// \param field_id The id of the field to get the accessor for.
+ /// \return The accessor to access the field, or NotFound if the field is
not found.
+ Result<std::unique_ptr<StructLikeAccessor>> GetAccessorById(int32_t
field_id) const;
+
/// \brief Creates a projected schema from selected field names.
///
/// \param names Selected field names and nested names are dot-concatenated.
@@ -106,6 +112,8 @@ class ICEBERG_EXPORT Schema : public StructType {
InitNameToIdMap(const Schema&);
static Result<std::unordered_map<std::string, int32_t, StringHash,
std::equal_to<>>>
InitLowerCaseNameToIdMap(const Schema&);
+ static Result<std::unordered_map<int32_t, std::vector<size_t>>>
InitIdToPositionPath(
+ const Schema&);
const std::optional<int32_t> schema_id_;
/// Mapping from field id to field.
@@ -114,6 +122,8 @@ class ICEBERG_EXPORT Schema : public StructType {
Lazy<InitNameToIdMap> name_to_id_;
/// Mapping from lowercased field name to field id
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
+ /// Mapping from field id to (nested) position path to access the field.
+ Lazy<InitIdToPositionPath> id_to_position_path_;
};
} // namespace iceberg
diff --git a/src/iceberg/test/CMakeLists.txt b/src/iceberg/test/CMakeLists.txt
index f1eb77e..d82fe17 100644
--- a/src/iceberg/test/CMakeLists.txt
+++ b/src/iceberg/test/CMakeLists.txt
@@ -141,6 +141,12 @@ if(ICEBERG_BUILD_BUNDLE)
test_common.cc
in_memory_catalog_test.cc)
+ add_iceberg_test(eval_expr_test
+ USE_BUNDLE
+ SOURCES
+ eval_expr_test.cc
+ test_common.cc)
+
add_iceberg_test(parquet_test
USE_BUNDLE
SOURCES
diff --git a/src/iceberg/test/eval_expr_test.cc
b/src/iceberg/test/eval_expr_test.cc
new file mode 100644
index 0000000..880f1ff
--- /dev/null
+++ b/src/iceberg/test/eval_expr_test.cc
@@ -0,0 +1,245 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <arrow/c/bridge.h>
+#include <arrow/json/from_string.h>
+#include <arrow/type.h>
+#include <arrow/type_fwd.h>
+#include <gtest/gtest.h>
+
+#include "iceberg/arrow_c_data.h"
+#include "iceberg/arrow_c_data_guard_internal.h"
+#include "iceberg/expression/expression.h"
+#include "iceberg/expression/literal.h"
+#include "iceberg/expression/term.h"
+#include "iceberg/row/arrow_array_wrapper.h"
+#include "iceberg/schema.h"
+#include "iceberg/schema_internal.h"
+#include "iceberg/test/matchers.h"
+#include "iceberg/transform.h"
+#include "iceberg/type.h"
+
+namespace iceberg {
+
+/// \brief Fixture providing a four-column schema and a matching two-row Arrow
+/// struct array exported through the Arrow C data interface.
+class BoundExpressionTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    schema_ = std::make_unique<Schema>(std::vector<SchemaField>{
+        SchemaField::MakeOptional(1, "id", int32()),
+        SchemaField::MakeOptional(2, "name", string()),
+        SchemaField::MakeRequired(3, "timestamp_field", timestamp()),
+        SchemaField::MakeRequired(4, "string_field", string())});
+
+    arrow_data_type_ = ::arrow::struct_(
+        {::arrow::field("id", ::arrow::int32()), ::arrow::field("name", ::arrow::utf8()),
+         ::arrow::field("timestamp_field", ::arrow::timestamp(::arrow::TimeUnit::MICRO)),
+         ::arrow::field("string_field", ::arrow::utf8())});
+
+    arrow_array_ = ::arrow::json::ArrayFromJSONString(arrow_data_type_, R"([
+      {"id": 1, "name": "Alice", "timestamp_field": 1609459200000000, "string_field": "hello_world"},
+      {"id": 2, "name": null, "timestamp_field": 1609459200000000, "string_field": "hello_world"}
+    ])")
+                       .ValueOrDie();
+
+    ASSERT_TRUE(::arrow::ExportType(*arrow_data_type_, &arrow_c_schema_).ok());
+    ASSERT_TRUE(::arrow::ExportArray(*arrow_array_, &arrow_c_array_).ok());
+  }
+
+  void TearDown() override {
+    if (arrow_c_schema_.release != nullptr) {
+      ArrowSchemaRelease(&arrow_c_schema_);
+    }
+    if (arrow_c_array_.release != nullptr) {
+      ArrowArrayRelease(&arrow_c_array_);
+    }
+  }
+
+  std::unique_ptr<Schema> schema_;
+  std::shared_ptr<::arrow::DataType> arrow_data_type_;
+  std::shared_ptr<::arrow::Array> arrow_array_;
+  // Value-initialized so `release` is a well-defined nullptr in TearDown() even
+  // when SetUp() stops at a failed ASSERT before the corresponding export ran.
+  ArrowSchema arrow_c_schema_{};
+  ArrowArray arrow_c_array_{};
+};
+
+// Bound references to "id" and "name" must yield each row's value, including a
+// null literal for a null "name".
+TEST_F(BoundExpressionTest, EvaluateBoundReference) {
+  ICEBERG_UNWRAP_OR_FAIL(auto id_ref, NamedReference::Make("id"));
+  ICEBERG_UNWRAP_OR_FAIL(auto id_bound_ref,
+                         id_ref->Bind(*schema_, /*case_sensitive=*/true));
+
+  ICEBERG_UNWRAP_OR_FAIL(auto name_ref, NamedReference::Make("name"));
+  ICEBERG_UNWRAP_OR_FAIL(auto name_bound_ref,
+                         name_ref->Bind(*schema_, /*case_sensitive=*/true));
+
+  struct TestCase {
+    size_t row_index;
+    Literal expected_id;
+    Literal expected_name;
+  };
+
+  const std::vector<TestCase> cases{
+      {.row_index = 0,
+       .expected_id = Literal::Int(1),
+       .expected_name = Literal::String("Alice")},
+      {.row_index = 1,
+       .expected_id = Literal::Int(2),
+       .expected_name = Literal::Null(string())},
+  };
+
+  for (const auto& tc : cases) {
+    ICEBERG_UNWRAP_OR_FAIL(
+        auto row,
+        ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, tc.row_index));
+
+    ICEBERG_UNWRAP_OR_FAIL(auto actual_id, id_bound_ref->Evaluate(*row));
+    EXPECT_EQ(actual_id, tc.expected_id);
+
+    ICEBERG_UNWRAP_OR_FAIL(auto actual_name, name_bound_ref->Evaluate(*row));
+    if (!tc.expected_name.IsNull()) {
+      EXPECT_EQ(actual_name, tc.expected_name);
+    } else {
+      EXPECT_TRUE(actual_name.IsNull());
+    }
+  }
+}
+
+// The identity transform over "name" must return the column value unchanged,
+// and a null literal for a null input.
+TEST_F(BoundExpressionTest, IdentityTransform) {
+  ICEBERG_UNWRAP_OR_FAIL(auto name_ref, NamedReference::Make("name"));
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto name_transform,
+      UnboundTransform::Make(std::move(name_ref), Transform::Identity()));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_transform,
+                         name_transform->Bind(*schema_, /*case_sensitive=*/true));
+
+  struct TestCase {
+    size_t row_index;
+    Literal expected_name;
+  };
+
+  const std::vector<TestCase> cases{
+      {.row_index = 0, .expected_name = Literal::String("Alice")},
+      {.row_index = 1, .expected_name = Literal::Null(string())},
+  };
+
+  for (const auto& tc : cases) {
+    ICEBERG_UNWRAP_OR_FAIL(
+        auto row,
+        ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, tc.row_index));
+    ICEBERG_UNWRAP_OR_FAIL(auto actual, bound_transform->Evaluate(*row));
+    if (!tc.expected_name.IsNull()) {
+      EXPECT_EQ(actual, tc.expected_name);
+    } else {
+      EXPECT_TRUE(actual.IsNull());
+    }
+  }
+}
+
+// Evaluates the year transform on row 0 of the fixture's "timestamp_field".
+TEST_F(BoundExpressionTest, YearTransform) {
+  // Build year(timestamp_field) and bind it to the schema.
+  ICEBERG_UNWRAP_OR_FAIL(auto ts_ref, NamedReference::Make("timestamp_field"));
+  ICEBERG_UNWRAP_OR_FAIL(auto year_term,
+                         UnboundTransform::Make(std::move(ts_ref), Transform::Year()));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_year,
+                         year_term->Bind(*schema_, /*case_sensitive=*/true));
+
+  // Row 0 holds 2021-01-01 00:00:00 UTC = 1609459200000000 microseconds.
+  ICEBERG_UNWRAP_OR_FAIL(auto row,
+                         ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0));
+
+  // Expect 2021.
+  ICEBERG_UNWRAP_OR_FAIL(auto year, bound_year->Evaluate(*row));
+  EXPECT_FALSE(year.IsNull());
+  EXPECT_EQ(std::get<int32_t>(year.value()), 2021);  // Year value
+}
+
+// Evaluates the month transform on row 0 of the fixture's "timestamp_field".
+TEST_F(BoundExpressionTest, MonthTransform) {
+  // Build month(timestamp_field) and bind it to the schema.
+  ICEBERG_UNWRAP_OR_FAIL(auto ts_ref, NamedReference::Make("timestamp_field"));
+  ICEBERG_UNWRAP_OR_FAIL(auto month_term,
+                         UnboundTransform::Make(std::move(ts_ref), Transform::Month()));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_month,
+                         month_term->Bind(*schema_, /*case_sensitive=*/true));
+
+  // Row 0 holds 2021-01-01.
+  ICEBERG_UNWRAP_OR_FAIL(auto row,
+                         ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0));
+
+  // Expect 2021-01 expressed as a month count.
+  ICEBERG_UNWRAP_OR_FAIL(auto month, bound_month->Evaluate(*row));
+  EXPECT_FALSE(month.IsNull());
+  EXPECT_EQ(std::get<int32_t>(month.value()), 612);  // Months since 1970-01
+}
+
+// Evaluates the day transform on row 0 of the fixture's "timestamp_field".
+TEST_F(BoundExpressionTest, DayTransform) {
+  // Build day(timestamp_field) and bind it to the schema.
+  ICEBERG_UNWRAP_OR_FAIL(auto ts_ref, NamedReference::Make("timestamp_field"));
+  ICEBERG_UNWRAP_OR_FAIL(auto day_term,
+                         UnboundTransform::Make(std::move(ts_ref), Transform::Day()));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_day,
+                         day_term->Bind(*schema_, /*case_sensitive=*/true));
+
+  // Row 0 holds 2021-01-01.
+  ICEBERG_UNWRAP_OR_FAIL(auto row,
+                         ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0));
+
+  // Expect the day count for that date.
+  ICEBERG_UNWRAP_OR_FAIL(auto day, bound_day->Evaluate(*row));
+  EXPECT_FALSE(day.IsNull());
+  EXPECT_EQ(std::get<int32_t>(day.value()), 18628);  // Days since 1970-01-01
+}
+
+// Evaluates bucket[4] on row 0 of "string_field" and checks the bucket range.
+TEST_F(BoundExpressionTest, BucketTransform) {
+  // Build bucket[4](string_field) and bind it to the schema.
+  ICEBERG_UNWRAP_OR_FAIL(auto str_ref, NamedReference::Make("string_field"));
+  ICEBERG_UNWRAP_OR_FAIL(auto bucket_term,
+                         UnboundTransform::Make(std::move(str_ref), Transform::Bucket(4)));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_bucket,
+                         bucket_term->Bind(*schema_, /*case_sensitive=*/true));
+
+  // Row 0 holds "hello_world".
+  ICEBERG_UNWRAP_OR_FAIL(auto row,
+                         ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0));
+
+  // The bucket index must fall in [0, 3].
+  ICEBERG_UNWRAP_OR_FAIL(auto bucket, bound_bucket->Evaluate(*row));
+  EXPECT_FALSE(bucket.IsNull());
+  const auto bucket_index = std::get<int32_t>(bucket.value());
+  EXPECT_GE(bucket_index, 0);
+  EXPECT_LT(bucket_index, 4);
+}
+
+// Evaluates truncate[5] on row 0 of "string_field".
+TEST_F(BoundExpressionTest, TruncateTransform) {
+  // Build truncate[5](string_field) and bind it to the schema.
+  ICEBERG_UNWRAP_OR_FAIL(auto str_ref, NamedReference::Make("string_field"));
+  ICEBERG_UNWRAP_OR_FAIL(
+      auto truncate_term,
+      UnboundTransform::Make(std::move(str_ref), Transform::Truncate(5)));
+  ICEBERG_UNWRAP_OR_FAIL(auto bound_truncate,
+                         truncate_term->Bind(*schema_, /*case_sensitive=*/true));
+
+  // Row 0 holds "hello_world".
+  ICEBERG_UNWRAP_OR_FAIL(auto row,
+                         ArrowArrayStructLike::Make(arrow_c_schema_, arrow_c_array_, 0));
+
+  // "hello_world" truncated to 5 chars = "hello".
+  ICEBERG_UNWRAP_OR_FAIL(auto truncated, bound_truncate->Evaluate(*row));
+  EXPECT_FALSE(truncated.IsNull());
+  EXPECT_EQ(std::get<std::string>(truncated.value()), "hello");
+}
+
+} // namespace iceberg
diff --git a/src/iceberg/test/struct_like_test.cc
b/src/iceberg/test/struct_like_test.cc
index b18ab8c..3683ed2 100644
--- a/src/iceberg/test/struct_like_test.cc
+++ b/src/iceberg/test/struct_like_test.cc
@@ -20,6 +20,7 @@
#include <arrow/c/bridge.h>
#include <arrow/json/from_string.h>
#include <arrow/type.h>
+#include <arrow/type_fwd.h>
#include <arrow/util/decimal.h>
#include "iceberg/arrow_c_data_guard_internal.h"
@@ -27,8 +28,10 @@
#include "iceberg/manifest_reader_internal.h"
#include "iceberg/row/arrow_array_wrapper.h"
#include "iceberg/row/manifest_wrapper.h"
+#include "iceberg/schema.h"
#include "iceberg/schema_internal.h"
#include "iceberg/test/matchers.h"
+#include "iceberg/type.h"
namespace iceberg {
@@ -386,4 +389,52 @@ TEST(ArrowArrayStructLike, PrimitiveMap) {
}
}
+// Accessors obtained from the schema by field id must reach fields nested one,
+// two and three struct levels deep. Each test value equals its field id.
+TEST(ArrowArrayStructLike, Accessor) {
+  Schema schema{std::vector<SchemaField>{
+      SchemaField::MakeOptional(1, "c1", int32()),
+      SchemaField::MakeOptional(
+          2, "c2",
+          struct_({
+              SchemaField::MakeOptional(3, "c3", int32()),
+              SchemaField::MakeOptional(4, "c4",
+                                        struct_({
+                                            SchemaField::MakeOptional(5, "c5", int32()),
+                                        })),
+          })),
+  }};
+
+  auto arrow_schema = ::arrow::struct_({
+      ::arrow::field("c1", ::arrow::int32()),
+      ::arrow::field("c2",
+                     ::arrow::struct_({
+                         ::arrow::field("c3", ::arrow::int32()),
+                         ::arrow::field("c4", ::arrow::struct_({
+                                                  ::arrow::field("c5", ::arrow::int32()),
+                                              })),
+                     })),
+  });
+
+  auto arrow_array =
+      ::arrow::json::ArrayFromJSONString(
+          arrow_schema, R"([ {"c1": 1, "c2": {"c3": 3, "c4": {"c5": 5}}} ])")
+          .ValueOrDie();
+
+  // Value-initialize the C structs: the guards are attached before the export
+  // calls, so they must not see an indeterminate `release` if an export fails.
+  ArrowSchema c_schema{};
+  ArrowArray c_array{};
+  internal::ArrowSchemaGuard schema_guard(&c_schema);
+  internal::ArrowArrayGuard array_guard(&c_array);
+  ASSERT_TRUE(::arrow::ExportType(*arrow_schema, &c_schema).ok());
+  ASSERT_TRUE(::arrow::ExportArray(*arrow_array, &c_array).ok());
+
+  ICEBERG_UNWRAP_OR_FAIL(auto struct_like, ArrowArrayStructLike::Make(c_schema, c_array));
+
+  // Test nested accessors from 1 to 3 levels deep
+  for (int32_t field_id : {1, 3, 5}) {
+    ICEBERG_UNWRAP_OR_FAIL(auto accessor, schema.GetAccessorById(field_id));
+    ICEBERG_UNWRAP_OR_FAIL(auto scalar, accessor->Get(*struct_like));
+    ASSERT_TRUE(std::holds_alternative<int32_t>(scalar));
+    EXPECT_EQ(std::get<int32_t>(scalar), field_id);
+  }
+}
+
} // namespace iceberg
diff --git a/src/iceberg/type_fwd.h b/src/iceberg/type_fwd.h
index 5485d83..79b43f5 100644
--- a/src/iceberg/type_fwd.h
+++ b/src/iceberg/type_fwd.h
@@ -144,9 +144,10 @@ struct WriterOptions;
class Reader;
class Writer;
-class StructLike;
class ArrayLike;
class MapLike;
+class StructLike;
+class StructLikeAccessor;
class TableUpdate;
class TableRequirement;