wgtmac commented on code in PR #207:
URL: https://github.com/apache/iceberg-cpp/pull/207#discussion_r2338321826
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
Review Comment:
```suggestion
Result<std::unique_ptr<Schema>> Select(Args&&... names,
```
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
Review Comment:
```suggestion
```
We have `std::span<const std::string>` as the input now, so this overload is
unnecessary.
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
Review Comment:
```suggestion
std::is_convertible_v<Args, std::string_view>) &&
```
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
Review Comment:
```suggestion
Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
```
It is an overkill to return const here
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
+ ...),
+ "All arguments must be convertible to std::string");
Review Comment:
```suggestion
"All arguments must be convertible to std::string or
std::string_view");
```
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
Review Comment:
```suggestion
/// \brief Creates a projected schema from selected field names.
///
/// \param names Selected field names and nested names are
dot-concatenated.
/// \param case_sensitive Whether name matching is case-sensitive
(default: true).
/// \return Projected schema containing only selected fields.
/// \note If the field name of a nested type has been selected, all of its
/// sub-fields will be selected.
```
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
+ ...),
+ "All arguments must be convertible to std::string");
+ return select({(names)...}, case_sensitive);
+ }
+
+ /// \brief Creates a projected schema from selected field IDs.
+ ///
+ /// Selects fields by their numeric IDs. More efficient than Select() when
you
+ /// already know the field IDs. Handles recursive projection of nested
structs.
+ ///
+ /// \param field_ids Set of field IDs to select
+ /// \return Projected schema containing only the specified fields
+ ///
+ /// \note When a struct field ID is specified:
+ /// - If nested field IDs are also in field_ids, they are recursively
projected
+ /// - If no nested field IDs are in field_ids, an empty struct is
included
+ /// - List/Map types cannot be explicitly projected (returns error)
+ Result<std::unique_ptr<const Schema>> Project(
+ std::unordered_set<int32_t>& field_ids) const;
Review Comment:
```suggestion
Result<std::unique_ptr<Schema>> Project(
const std::unordered_set<int32_t>& field_ids) const;
```
Return value should not have `const` but the input parameter should.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
Review Comment:
```suggestion
case TypeId::kStruct:
return Visit(internal::checked_pointer_cast<StructType>(type));
case TypeId::kList:
return Visit(internal::checked_pointer_cast<ListType>(type));
case TypeId::kMap:
return Visit(internal::checked_pointer_cast<MapType>(type));
default:
return nullptr;
```
Less is more :)
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
+ }
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+ if (selected_ids_.contains(field.field_id())) {
+ return (select_full_types_ || field.type()->is_primitive()) ?
field.type()
+ :
Visit(field.type());
+ }
+ return Visit(field.type());
+ }
+
+ static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type>
type) {
+ return {field.field_id(), std::string(field.name()), std::move(type),
+ field.optional(), std::string(field.doc())};
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type)
const {
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ for (const auto& field : type->fields()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+ if (child_type) {
+ same_types = same_types && (child_type == field.type());
+ selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+ }
+ }
+
+ if (selected_fields.empty()) {
+ return nullptr;
+ } else if (same_types and selected_fields.size() == type->fields().size())
{
+ return type;
+ }
+ return std::make_shared<StructType>(std::move(selected_fields));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const {
+ const auto& elem_field = type->fields()[0];
+ ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+ if (elem_type == nullptr) {
+ return nullptr;
+ } else if (elem_type == elem_field.type()) {
+ return type;
+ }
+ return std::make_shared<ListType>(MakeField(elem_field,
std::move(elem_type)));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const {
+ const auto& key_field = type->fields()[0];
+ const auto& value_field = type->fields()[1];
+ ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+ ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+ if (key_type == nullptr && value_type == nullptr) {
+ return nullptr;
+ } else if (value_type == value_field.type() &&
+ (key_type == key_field.type() || key_type == nullptr)) {
+ return type;
+ } else if (value_type == nullptr) {
+ return InvalidArgument("Cannot project Map without value field");
+ }
+ return std::make_shared<MapType>(
+ (key_type == nullptr ? key_field : MakeField(key_field,
std::move(key_type))),
+ MakeField(value_field, std::move(value_type)));
+ }
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::unique_ptr<const Schema>> Schema::Select(std::span<const
std::string> names,
+ bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
+ std::span<const std::string> names, bool case_sensitive) const {
+ const std::string kAllColumns = "*";
+ if (std::ranges::find(names, kAllColumns) != names.end()) {
+ return std::make_unique<Schema>(*this);
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ auto self = std::shared_ptr<const StructType>(this, [](const StructType*)
{});
+ ICEBERG_ASSIGN_OR_RAISE(auto result,
+
visitor.Visit(std::const_pointer_cast<StructType>(self)));
+
+ if (!result) {
+ return std::make_unique<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ auto& projected_struct = internal::checked_cast<const StructType&>(*result);
+
+ return FromStructType(std::move(const_cast<StructType&>(projected_struct)),
schema_id_);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::Project(
+ std::unordered_set<int32_t>& field_ids) const {
+ PruneColumnVisitor visitor(field_ids, /*select_full_types=*/false);
+ auto self = std::shared_ptr<const StructType>(this, [](const StructType*)
{});
Review Comment:
Same comment as above.
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
+ ...),
+ "All arguments must be convertible to std::string");
+ return select({(names)...}, case_sensitive);
+ }
+
+ /// \brief Creates a projected schema from selected field IDs.
+ ///
+ /// Selects fields by their numeric IDs. More efficient than Select() when
you
+ /// already know the field IDs. Handles recursive projection of nested
structs.
+ ///
+ /// \param field_ids Set of field IDs to select
+ /// \return Projected schema containing only the specified fields
+ ///
+ /// \note When a struct field ID is specified:
+ /// - If nested field IDs are also in field_ids, they are recursively
projected
+ /// - If no nested field IDs are in field_ids, an empty struct is
included
+ /// - List/Map types cannot be explicitly projected (returns error)
+ Result<std::unique_ptr<const Schema>> Project(
+ std::unordered_set<int32_t>& field_ids) const;
friend bool operator==(const Schema& lhs, const Schema& rhs) { return
lhs.Equals(rhs); }
private:
/// \brief Compare two schemas for equality.
- [[nodiscard]] bool Equals(const Schema& other) const;
+ bool Equals(const Schema& other) const;
+
+ Result<std::unique_ptr<const Schema>> SelectInternal(std::span<const
std::string> names,
Review Comment:
```suggestion
Result<std::unique_ptr<Schema>> SelectInternal(std::span<const
std::string> names,
```
##########
test/schema_test.cc:
##########
@@ -491,3 +493,921 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
EXPECT_THAT(result.error().message,
::testing::HasSubstr("Duplicate field id found: 1"));
}
+
+struct SelectTestParam {
+ std::string test_name;
+ std::function<std::shared_ptr<iceberg::Schema>()> create_schema;
+ std::vector<std::string> select_fields;
+ std::function<std::shared_ptr<iceberg::Schema>()> expected_schema;
+ bool should_succeed;
+ std::string expected_error_message;
+ bool case_sensitive = true;
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};
+
+TEST_P(SelectParamTest, SelectFields) {
+ const auto& param = GetParam();
+ auto input_schema = param.create_schema();
+
+ auto result = input_schema->Select(param.select_fields,
param.case_sensitive);
+
+ if (param.should_succeed) {
+ ASSERT_TRUE(result.has_value());
+ auto actual_schema = std::move(result.value());
+ auto expected_schema = param.expected_schema();
+ ASSERT_EQ(*actual_schema, *expected_schema)
+ << "Schema mismatch for: " << param.test_name;
+ } else {
+ ASSERT_FALSE(result.has_value());
+ ASSERT_THAT(result,
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+
+ ASSERT_THAT(result,
iceberg::HasErrorMessage(param.expected_error_message));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectTestCases, SelectParamTest,
+ ::testing::Values(
+ SelectTestParam{.test_name = "SelectAllColumns",
+ .create_schema =
+ []() {
+ auto field1 =
std::make_unique<iceberg::SchemaField>(
Review Comment:
We shouldn't make a unique_ptr and then only copy its data below. Instead,
we can:
```
SchemaField field1(1, "id", iceberg::int32(), false);
...
return std::make_shared<iceberg::Schema>(
std::vector<iceberg::SchemaField>{std::move(field1), ...});
```
##########
test/schema_test.cc:
##########
@@ -491,3 +493,921 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
EXPECT_THAT(result.error().message,
::testing::HasSubstr("Duplicate field id found: 1"));
}
+
+struct SelectTestParam {
+ std::string test_name;
+ std::function<std::shared_ptr<iceberg::Schema>()> create_schema;
+ std::vector<std::string> select_fields;
+ std::function<std::shared_ptr<iceberg::Schema>()> expected_schema;
+ bool should_succeed;
+ std::string expected_error_message;
+ bool case_sensitive = true;
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};
+
+TEST_P(SelectParamTest, SelectFields) {
+ const auto& param = GetParam();
+ auto input_schema = param.create_schema();
+
+ auto result = input_schema->Select(param.select_fields,
param.case_sensitive);
+
+ if (param.should_succeed) {
+ ASSERT_TRUE(result.has_value());
+ auto actual_schema = std::move(result.value());
+ auto expected_schema = param.expected_schema();
+ ASSERT_EQ(*actual_schema, *expected_schema)
+ << "Schema mismatch for: " << param.test_name;
+ } else {
+ ASSERT_FALSE(result.has_value());
+ ASSERT_THAT(result,
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+
Review Comment:
```suggestion
```
##########
test/schema_test.cc:
##########
@@ -491,3 +493,921 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
EXPECT_THAT(result.error().message,
::testing::HasSubstr("Duplicate field id found: 1"));
}
+
+struct SelectTestParam {
+ std::string test_name;
+ std::function<std::shared_ptr<iceberg::Schema>()> create_schema;
+ std::vector<std::string> select_fields;
+ std::function<std::shared_ptr<iceberg::Schema>()> expected_schema;
+ bool should_succeed;
+ std::string expected_error_message;
+ bool case_sensitive = true;
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};
+
+TEST_P(SelectParamTest, SelectFields) {
+ const auto& param = GetParam();
+ auto input_schema = param.create_schema();
+
+ auto result = input_schema->Select(param.select_fields,
param.case_sensitive);
+
+ if (param.should_succeed) {
+ ASSERT_TRUE(result.has_value());
+ auto actual_schema = std::move(result.value());
+ auto expected_schema = param.expected_schema();
+ ASSERT_EQ(*actual_schema, *expected_schema)
+ << "Schema mismatch for: " << param.test_name;
+ } else {
+ ASSERT_FALSE(result.has_value());
+ ASSERT_THAT(result,
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+
+ ASSERT_THAT(result,
iceberg::HasErrorMessage(param.expected_error_message));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectTestCases, SelectParamTest,
+ ::testing::Values(
+ SelectTestParam{.test_name = "SelectAllColumns",
+ .create_schema =
Review Comment:
If the same schema has been used for multiple time, we can add a function
above to reuse. The point is to make these lines as short as possible.
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
+ ...),
+ "All arguments must be convertible to std::string");
+ return select({(names)...}, case_sensitive);
Review Comment:
We don't have test for this function, otherwise it must fail
##########
src/iceberg/schema.h:
##########
@@ -65,18 +66,63 @@ class ICEBERG_EXPORT Schema : public StructType {
/// canonical name 'm.value.x'
/// FIXME: Currently only handles ASCII lowercase conversion; extend to
support
/// non-ASCII characters (e.g., using std::towlower or ICU)
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldByName(std::string_view name, bool case_sensitive = true) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldByName(
+ std::string_view name, bool case_sensitive = true) const;
/// \brief Find the SchemaField by field id.
- [[nodiscard]] Result<std::optional<std::reference_wrapper<const
SchemaField>>>
- FindFieldById(int32_t field_id) const;
+ Result<std::optional<std::reference_wrapper<const SchemaField>>>
FindFieldById(
+ int32_t field_id) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ ///
+ /// Selects fields by their names using dot notation for nested fields.
+ /// Supports both canonical names (e.g., "user.address.street") and short
names
+ /// (e.g., "user.street" for map values, "list.element" for list elements).
+ ///
+ /// \param names Field names to select (supports nested field paths)
+ /// \param case_sensitive Whether name matching is case-sensitive (default:
true)
+ /// \return Projected schema containing only the specified fields
+ Result<std::unique_ptr<const Schema>> Select(std::span<const std::string>
names,
+ bool case_sensitive = true)
const;
+
+ /// \brief Creates a projected schema from selected field names.
+ Result<std::unique_ptr<const Schema>> Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive =
true) const;
+
+ /// \brief Creates a projected schema from selected field names.
+ template <typename... Args>
+ Result<std::unique_ptr<const Schema>> Select(Args&&... names,
+ bool case_sensitive = true)
const {
+ static_assert(((std::is_convertible_v<Args, std::string> ||
+ std::is_convertible_v<Args, std::string>) &&
+ ...),
+ "All arguments must be convertible to std::string");
+ return select({(names)...}, case_sensitive);
+ }
+
+ /// \brief Creates a projected schema from selected field IDs.
+ ///
+ /// Selects fields by their numeric IDs. More efficient than Select() when
you
+ /// already know the field IDs. Handles recursive projection of nested
structs.
+ ///
+ /// \param field_ids Set of field IDs to select
+ /// \return Projected schema containing only the specified fields
+ ///
+ /// \note When a struct field ID is specified:
+ /// - If nested field IDs are also in field_ids, they are recursively
projected
+ /// - If no nested field IDs are in field_ids, an empty struct is
included
+ /// - List/Map types cannot be explicitly projected (returns error)
Review Comment:
```suggestion
/// \brief Creates a projected schema from selected field IDs.
///
/// \param field_ids Set of field IDs to project.
/// \return Projected schema containing only selected field IDs.
/// \note Field ID of a nested field may not be projected unless at least
/// one of its sub-fields has been projected.
```
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
+ }
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+ if (selected_ids_.contains(field.field_id())) {
+ return (select_full_types_ || field.type()->is_primitive()) ?
field.type()
+ :
Visit(field.type());
+ }
+ return Visit(field.type());
+ }
+
+ static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type>
type) {
+ return {field.field_id(), std::string(field.name()), std::move(type),
+ field.optional(), std::string(field.doc())};
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type)
const {
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ for (const auto& field : type->fields()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+ if (child_type) {
+ same_types = same_types && (child_type == field.type());
+ selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+ }
+ }
+
+ if (selected_fields.empty()) {
+ return nullptr;
+ } else if (same_types and selected_fields.size() == type->fields().size())
{
+ return type;
+ }
+ return std::make_shared<StructType>(std::move(selected_fields));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const {
+ const auto& elem_field = type->fields()[0];
+ ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+ if (elem_type == nullptr) {
+ return nullptr;
+ } else if (elem_type == elem_field.type()) {
+ return type;
+ }
+ return std::make_shared<ListType>(MakeField(elem_field,
std::move(elem_type)));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const {
+ const auto& key_field = type->fields()[0];
+ const auto& value_field = type->fields()[1];
+ ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+ ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+ if (key_type == nullptr && value_type == nullptr) {
+ return nullptr;
+ } else if (value_type == value_field.type() &&
+ (key_type == key_field.type() || key_type == nullptr)) {
+ return type;
+ } else if (value_type == nullptr) {
+ return InvalidArgument("Cannot project Map without value field");
+ }
+ return std::make_shared<MapType>(
+ (key_type == nullptr ? key_field : MakeField(key_field,
std::move(key_type))),
+ MakeField(value_field, std::move(value_type)));
+ }
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::unique_ptr<const Schema>> Schema::Select(std::span<const
std::string> names,
+ bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
Review Comment:
Can we remove `SelectInternal` now? We can remove the `Select` function
overload with `std::initializer_list`.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
+ }
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+ if (selected_ids_.contains(field.field_id())) {
+ return (select_full_types_ || field.type()->is_primitive()) ?
field.type()
+ :
Visit(field.type());
+ }
+ return Visit(field.type());
+ }
+
+ static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type>
type) {
+ return {field.field_id(), std::string(field.name()), std::move(type),
+ field.optional(), std::string(field.doc())};
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type)
const {
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ for (const auto& field : type->fields()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+ if (child_type) {
+ same_types = same_types && (child_type == field.type());
+ selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+ }
+ }
+
+ if (selected_fields.empty()) {
+ return nullptr;
+ } else if (same_types and selected_fields.size() == type->fields().size())
{
+ return type;
+ }
+ return std::make_shared<StructType>(std::move(selected_fields));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const {
+ const auto& elem_field = type->fields()[0];
+ ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+ if (elem_type == nullptr) {
+ return nullptr;
+ } else if (elem_type == elem_field.type()) {
+ return type;
+ }
+ return std::make_shared<ListType>(MakeField(elem_field,
std::move(elem_type)));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const {
+ const auto& key_field = type->fields()[0];
+ const auto& value_field = type->fields()[1];
+ ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+ ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+ if (key_type == nullptr && value_type == nullptr) {
+ return nullptr;
+ } else if (value_type == value_field.type() &&
+ (key_type == key_field.type() || key_type == nullptr)) {
+ return type;
+ } else if (value_type == nullptr) {
+ return InvalidArgument("Cannot project Map without value field");
+ }
+ return std::make_shared<MapType>(
+ (key_type == nullptr ? key_field : MakeField(key_field,
std::move(key_type))),
+ MakeField(value_field, std::move(value_type)));
+ }
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
Review Comment:
```suggestion
const bool select_full_types_;
```
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
+ }
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+ if (selected_ids_.contains(field.field_id())) {
+ return (select_full_types_ || field.type()->is_primitive()) ?
field.type()
+ :
Visit(field.type());
+ }
+ return Visit(field.type());
+ }
+
+ static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type>
type) {
+ return {field.field_id(), std::string(field.name()), std::move(type),
+ field.optional(), std::string(field.doc())};
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type)
const {
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ for (const auto& field : type->fields()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+ if (child_type) {
+ same_types = same_types && (child_type == field.type());
+ selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+ }
+ }
+
+ if (selected_fields.empty()) {
+ return nullptr;
+ } else if (same_types and selected_fields.size() == type->fields().size())
{
+ return type;
+ }
+ return std::make_shared<StructType>(std::move(selected_fields));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const {
+ const auto& elem_field = type->fields()[0];
+ ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+ if (elem_type == nullptr) {
+ return nullptr;
+ } else if (elem_type == elem_field.type()) {
+ return type;
+ }
+ return std::make_shared<ListType>(MakeField(elem_field,
std::move(elem_type)));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const {
+ const auto& key_field = type->fields()[0];
+ const auto& value_field = type->fields()[1];
+ ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+ ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+ if (key_type == nullptr && value_type == nullptr) {
+ return nullptr;
+ } else if (value_type == value_field.type() &&
+ (key_type == key_field.type() || key_type == nullptr)) {
+ return type;
+ } else if (value_type == nullptr) {
+ return InvalidArgument("Cannot project Map without value field");
+ }
+ return std::make_shared<MapType>(
+ (key_type == nullptr ? key_field : MakeField(key_field,
std::move(key_type))),
+ MakeField(value_field, std::move(value_type)));
+ }
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::unique_ptr<const Schema>> Schema::Select(std::span<const
std::string> names,
+ bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
+ std::span<const std::string> names, bool case_sensitive) const {
+ const std::string kAllColumns = "*";
+ if (std::ranges::find(names, kAllColumns) != names.end()) {
+ return std::make_unique<Schema>(*this);
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ auto self = std::shared_ptr<const StructType>(this, [](const StructType*)
{});
+ ICEBERG_ASSIGN_OR_RAISE(auto result,
+
visitor.Visit(std::const_pointer_cast<StructType>(self)));
+
+ if (!result) {
+ return std::make_unique<Schema>(std::vector<SchemaField>{}, schema_id_);
+ }
+
+ if (result->type_id() != TypeId::kStruct) {
+ return InvalidSchema("Projected type must be a struct type");
+ }
+
+ auto& projected_struct = internal::checked_cast<const StructType&>(*result);
+
Review Comment:
Why there is a blank line but line 417 does not have one? I think we can
have a one-liner for 397 and 399.
##########
src/iceberg/schema.cc:
##########
@@ -257,4 +258,164 @@ void NameToIdVisitor::Finish() {
}
}
+/// \brief Visitor class for pruning schema columns based on selected field
IDs.
+///
+/// This visitor traverses a schema and creates a projected version containing
only
+/// the specified fields. When `select_full_types` is true, a field with all
its
+/// sub-fields are selected if its field-id has been selected; otherwise, only
leaf
+/// fields of selected field-ids are selected.
+///
+/// \note It returns an error when projection is not successful.
+class PruneColumnVisitor {
+ public:
+ PruneColumnVisitor(const std::unordered_set<int32_t>& selected_ids,
+ bool select_full_types)
+ : selected_ids_(selected_ids), select_full_types_(select_full_types) {}
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<Type>& type) const
{
+ switch (type->type_id()) {
+ case TypeId::kStruct: {
+ return Visit(internal::checked_pointer_cast<StructType>(type));
+ }
+ case TypeId::kList: {
+ return Visit(internal::checked_pointer_cast<ListType>(type));
+ }
+ case TypeId::kMap: {
+ return Visit(internal::checked_pointer_cast<MapType>(type));
+ }
+ default: {
+ return nullptr;
+ }
+ }
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const SchemaField& field) const {
+ if (selected_ids_.contains(field.field_id())) {
+ return (select_full_types_ || field.type()->is_primitive()) ?
field.type()
+ :
Visit(field.type());
+ }
+ return Visit(field.type());
+ }
+
+ static SchemaField MakeField(const SchemaField& field, std::shared_ptr<Type>
type) {
+ return {field.field_id(), std::string(field.name()), std::move(type),
+ field.optional(), std::string(field.doc())};
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<StructType>& type)
const {
+ bool same_types = true;
+ std::vector<SchemaField> selected_fields;
+ for (const auto& field : type->fields()) {
+ ICEBERG_ASSIGN_OR_RAISE(auto child_type, Visit(field));
+ if (child_type) {
+ same_types = same_types && (child_type == field.type());
+ selected_fields.emplace_back(MakeField(field, std::move(child_type)));
+ }
+ }
+
+ if (selected_fields.empty()) {
+ return nullptr;
+ } else if (same_types and selected_fields.size() == type->fields().size())
{
+ return type;
+ }
+ return std::make_shared<StructType>(std::move(selected_fields));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<ListType>& type)
const {
+ const auto& elem_field = type->fields()[0];
+ ICEBERG_ASSIGN_OR_RAISE(auto elem_type, Visit(elem_field));
+ if (elem_type == nullptr) {
+ return nullptr;
+ } else if (elem_type == elem_field.type()) {
+ return type;
+ }
+ return std::make_shared<ListType>(MakeField(elem_field,
std::move(elem_type)));
+ }
+
+ Result<std::shared_ptr<Type>> Visit(const std::shared_ptr<MapType>& type)
const {
+ const auto& key_field = type->fields()[0];
+ const auto& value_field = type->fields()[1];
+ ICEBERG_ASSIGN_OR_RAISE(auto key_type, Visit(key_field));
+ ICEBERG_ASSIGN_OR_RAISE(auto value_type, Visit(value_field));
+
+ if (key_type == nullptr && value_type == nullptr) {
+ return nullptr;
+ } else if (value_type == value_field.type() &&
+ (key_type == key_field.type() || key_type == nullptr)) {
+ return type;
+ } else if (value_type == nullptr) {
+ return InvalidArgument("Cannot project Map without value field");
+ }
+ return std::make_shared<MapType>(
+ (key_type == nullptr ? key_field : MakeField(key_field,
std::move(key_type))),
+ MakeField(value_field, std::move(value_type)));
+ }
+
+ private:
+ const std::unordered_set<int32_t>& selected_ids_;
+ bool select_full_types_;
+};
+
+Result<std::unique_ptr<const Schema>> Schema::Select(std::span<const
std::string> names,
+ bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::Select(
+ const std::initializer_list<std::string>& names, bool case_sensitive)
const {
+ return SelectInternal(names, case_sensitive);
+}
+
+Result<std::unique_ptr<const Schema>> Schema::SelectInternal(
+ std::span<const std::string> names, bool case_sensitive) const {
+ const std::string kAllColumns = "*";
+ if (std::ranges::find(names, kAllColumns) != names.end()) {
+ return std::make_unique<Schema>(*this);
+ }
+
+ std::unordered_set<int32_t> selected_ids;
+ for (const auto& name : names) {
+ ICEBERG_ASSIGN_OR_RAISE(auto result, FindFieldByName(name,
case_sensitive));
+ if (result.has_value()) {
+ selected_ids.insert(result.value().get().field_id());
+ }
+ }
+
+ PruneColumnVisitor visitor(selected_ids, /*select_full_types=*/true);
+ auto self = std::shared_ptr<const StructType>(this, [](const StructType*)
{});
+ ICEBERG_ASSIGN_OR_RAISE(auto result,
+
visitor.Visit(std::const_pointer_cast<StructType>(self)));
Review Comment:
Please do not hack like this. You may add a function
`std::unique_ptr<StructType> ToStructType(const Schema& schema);` to
`iceberg/schema_internal.h` by copying all fields to the struct type and return.
##########
test/schema_test.cc:
##########
@@ -491,3 +493,921 @@ TEST(SchemaTest, NestedDuplicateFieldIdError) {
EXPECT_THAT(result.error().message,
::testing::HasSubstr("Duplicate field id found: 1"));
}
+
+struct SelectTestParam {
+ std::string test_name;
+ std::function<std::shared_ptr<iceberg::Schema>()> create_schema;
+ std::vector<std::string> select_fields;
+ std::function<std::shared_ptr<iceberg::Schema>()> expected_schema;
+ bool should_succeed;
+ std::string expected_error_message;
+ bool case_sensitive = true;
+};
+
+class SelectParamTest : public ::testing::TestWithParam<SelectTestParam> {};
+
+TEST_P(SelectParamTest, SelectFields) {
+ const auto& param = GetParam();
+ auto input_schema = param.create_schema();
+
+ auto result = input_schema->Select(param.select_fields,
param.case_sensitive);
+
+ if (param.should_succeed) {
+ ASSERT_TRUE(result.has_value());
+ auto actual_schema = std::move(result.value());
+ auto expected_schema = param.expected_schema();
+ ASSERT_EQ(*actual_schema, *expected_schema)
+ << "Schema mismatch for: " << param.test_name;
+ } else {
+ ASSERT_FALSE(result.has_value());
+ ASSERT_THAT(result,
iceberg::IsError(iceberg::ErrorKind::kInvalidArgument));
+
+ ASSERT_THAT(result,
iceberg::HasErrorMessage(param.expected_error_message));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ SelectTestCases, SelectParamTest,
+ ::testing::Values(
+ SelectTestParam{.test_name = "SelectAllColumns",
+ .create_schema =
+ []() {
+ auto field1 =
std::make_unique<iceberg::SchemaField>(
Review Comment:
If multiple fields are used, perhaps we can also do this to reuse them in
all test cases:
```
static SchemaField MakeId() {
return {1, "id", iceberg::int32(), false};
}
static SchemaField MakeXXX() {
return ...
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]