This is an automated email from the ASF dual-hosted git repository. fokko pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git
The following commit(s) were added to refs/heads/main by this push: new 646a1b9 feat(parquet): add HasFieldIds check (#158) 646a1b9 is described below commit 646a1b915e8c55779634b68c1c01a41ba21a3c61 Author: Gang Wu <ust...@gmail.com> AuthorDate: Mon Aug 4 18:19:40 2025 +0800 feat(parquet): add HasFieldIds check (#158) --- src/iceberg/parquet/parquet_reader.cc | 4 +- src/iceberg/parquet/parquet_schema_util.cc | 20 ++++++- src/iceberg/parquet/parquet_schema_util_internal.h | 5 +- test/CMakeLists.txt | 2 + test/parquet_schema_test.cc | 66 ++++++++++++++++++++++ 5 files changed, 89 insertions(+), 8 deletions(-) diff --git a/src/iceberg/parquet/parquet_reader.cc b/src/iceberg/parquet/parquet_reader.cc index 3ee260a..282ce5e 100644 --- a/src/iceberg/parquet/parquet_reader.cc +++ b/src/iceberg/parquet/parquet_reader.cc @@ -61,9 +61,7 @@ Result<SchemaProjection> BuildProjection(::parquet::arrow::FileReader* reader, const Schema& read_schema) { auto metadata = reader->parquet_reader()->metadata(); - ICEBERG_ASSIGN_OR_RAISE(auto has_field_ids, - HasFieldIds(metadata->schema()->schema_root())); - if (!has_field_ids) { + if (!HasFieldIds(metadata->schema()->schema_root())) { // TODO(gangwu): apply name mapping to Parquet schema return NotImplemented("Applying name mapping to Parquet schema is not implemented"); } diff --git a/src/iceberg/parquet/parquet_schema_util.cc b/src/iceberg/parquet/parquet_schema_util.cc index 8648fa9..05c7122 100644 --- a/src/iceberg/parquet/parquet_schema_util.cc +++ b/src/iceberg/parquet/parquet_schema_util.cc @@ -17,7 +17,10 @@ * under the License. */ +#include <parquet/schema.h> + #include "iceberg/parquet/parquet_schema_util_internal.h" +#include "iceberg/util/checked_cast.h" namespace iceberg::parquet { @@ -30,8 +33,21 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio return NotImplemented("NYI"); } -Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node) { - return NotImplemented("NYI"); +bool HasFieldIds(const ::parquet::schema::NodePtr& node) { + if (node->field_id() >= 0) { + return true; + } + + if (node->is_group()) { + auto group_node = internal::checked_pointer_cast<::parquet::schema::GroupNode>(node); + for (int i = 0; i < group_node->field_count(); i++) { + if (HasFieldIds(group_node->field(i))) { + return true; + } + } + } + + return false; } } // namespace iceberg::parquet diff --git a/src/iceberg/parquet/parquet_schema_util_internal.h b/src/iceberg/parquet/parquet_schema_util_internal.h index f3b0f37..84e71ab 100644 --- a/src/iceberg/parquet/parquet_schema_util_internal.h +++ b/src/iceberg/parquet/parquet_schema_util_internal.h @@ -47,8 +47,7 @@ Result<std::vector<int>> SelectedColumnIndices(const SchemaProjection& projectio /// \brief Check whether the Parquet schema has field IDs. /// /// \param root_node The root node of the Parquet schema. -/// \return True if the Parquet schema has field IDs, false otherwise. Return error if -/// the Parquet schema has partial field IDs. -Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node); +/// \return True if the Parquet schema has field IDs, false otherwise. +bool HasFieldIds(const ::parquet::schema::NodePtr& root_node); } // namespace iceberg::parquet diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 51fc776..091fa29 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -116,4 +116,6 @@ if(ICEBERG_BUILD_BUNDLE) SOURCES test_common.cc in_memory_catalog_test.cc) + + add_iceberg_test(parquet_test USE_BUNDLE SOURCES parquet_schema_test.cc) endif() diff --git a/test/parquet_schema_test.cc b/test/parquet_schema_test.cc new file mode 100644 index 0000000..1a280ff --- /dev/null +++ b/test/parquet_schema_test.cc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <gtest/gtest.h> +#include <parquet/schema.h> +#include <parquet/types.h> + +#include "iceberg/parquet/parquet_schema_util_internal.h" + +namespace iceberg::parquet { + +namespace { + +::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1) { + return ::parquet::schema::PrimitiveNode::Make( + name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(), + ::parquet::Type::INT32, /*primitive_length=*/-1, field_id); +} + +::parquet::schema::NodePtr MakeGroupNode(const std::string& name, + const ::parquet::schema::NodeVector& fields, + int field_id = -1) { + return ::parquet::schema::GroupNode::Make(name, ::parquet::Repetition::REQUIRED, fields, + /*logical_type=*/nullptr, field_id); +} + +} // namespace + +TEST(HasFieldIds, PrimitiveNode) { + EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field"))); + EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1))); +} + +TEST(HasFieldIds, GroupNode) { + auto group_node_without_field_id = + MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")}); + EXPECT_FALSE(HasFieldIds(group_node_without_field_id)); + + auto group_node_with_full_field_id = MakeGroupNode( + "test_group", + {MakeInt32Node("c1", /*field_id=*/2), MakeInt32Node("c2", /*field_id=*/3)}, + /*field_id=*/1); + EXPECT_TRUE(HasFieldIds(group_node_with_full_field_id)); + + auto group_node_with_partial_field_id = MakeGroupNode( + "test_group", {MakeInt32Node("c1", /*field_id=*/1), MakeInt32Node("c2")}); + EXPECT_TRUE(HasFieldIds(group_node_with_partial_field_id)); +} + +} // namespace iceberg::parquet