This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/iceberg-cpp.git


The following commit(s) were added to refs/heads/main by this push:
     new 646a1b9  feat(parquet): add HasFieldIds check (#158)
646a1b9 is described below

commit 646a1b915e8c55779634b68c1c01a41ba21a3c61
Author: Gang Wu <ust...@gmail.com>
AuthorDate: Mon Aug 4 18:19:40 2025 +0800

    feat(parquet): add HasFieldIds check (#158)
---
 src/iceberg/parquet/parquet_reader.cc              |  4 +-
 src/iceberg/parquet/parquet_schema_util.cc         | 20 ++++++-
 src/iceberg/parquet/parquet_schema_util_internal.h |  5 +-
 test/CMakeLists.txt                                |  2 +
 test/parquet_schema_test.cc                        | 66 ++++++++++++++++++++++
 5 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/src/iceberg/parquet/parquet_reader.cc 
b/src/iceberg/parquet/parquet_reader.cc
index 3ee260a..282ce5e 100644
--- a/src/iceberg/parquet/parquet_reader.cc
+++ b/src/iceberg/parquet/parquet_reader.cc
@@ -61,9 +61,7 @@ Result<SchemaProjection> 
BuildProjection(::parquet::arrow::FileReader* reader,
                                          const Schema& read_schema) {
   auto metadata = reader->parquet_reader()->metadata();
 
-  ICEBERG_ASSIGN_OR_RAISE(auto has_field_ids,
-                          HasFieldIds(metadata->schema()->schema_root()));
-  if (!has_field_ids) {
+  if (!HasFieldIds(metadata->schema()->schema_root())) {
     // TODO(gangwu): apply name mapping to Parquet schema
     return NotImplemented("Applying name mapping to Parquet schema is not 
implemented");
   }
diff --git a/src/iceberg/parquet/parquet_schema_util.cc 
b/src/iceberg/parquet/parquet_schema_util.cc
index 8648fa9..05c7122 100644
--- a/src/iceberg/parquet/parquet_schema_util.cc
+++ b/src/iceberg/parquet/parquet_schema_util.cc
@@ -17,7 +17,10 @@
  * under the License.
  */
 
+#include <parquet/schema.h>
+
 #include "iceberg/parquet/parquet_schema_util_internal.h"
+#include "iceberg/util/checked_cast.h"
 
 namespace iceberg::parquet {
 
@@ -30,8 +33,21 @@ Result<std::vector<int>> SelectedColumnIndices(const 
SchemaProjection& projectio
   return NotImplemented("NYI");
 }
 
-Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node) {
-  return NotImplemented("NYI");
+bool HasFieldIds(const ::parquet::schema::NodePtr& node) {
+  if (node->field_id() >= 0) {
+    return true;
+  }
+
+  if (node->is_group()) {
+    auto group_node = 
internal::checked_pointer_cast<::parquet::schema::GroupNode>(node);
+    for (int i = 0; i < group_node->field_count(); i++) {
+      if (HasFieldIds(group_node->field(i))) {
+        return true;
+      }
+    }
+  }
+
+  return false;
 }
 
 }  // namespace iceberg::parquet
diff --git a/src/iceberg/parquet/parquet_schema_util_internal.h 
b/src/iceberg/parquet/parquet_schema_util_internal.h
index f3b0f37..84e71ab 100644
--- a/src/iceberg/parquet/parquet_schema_util_internal.h
+++ b/src/iceberg/parquet/parquet_schema_util_internal.h
@@ -47,8 +47,7 @@ Result<std::vector<int>> SelectedColumnIndices(const 
SchemaProjection& projectio
 /// \brief Check whether the Parquet schema has field IDs.
 ///
 /// \param root_node The root node of the Parquet schema.
-/// \return True if the Parquet schema has field IDs, false otherwise. Return 
error if
-/// the Parquet schema has partial field IDs.
-Result<bool> HasFieldIds(const ::parquet::schema::NodePtr& root_node);
+/// \return True if the Parquet schema has field IDs, false otherwise.
+bool HasFieldIds(const ::parquet::schema::NodePtr& root_node);
 
 }  // namespace iceberg::parquet
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 51fc776..091fa29 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -116,4 +116,6 @@ if(ICEBERG_BUILD_BUNDLE)
                    SOURCES
                    test_common.cc
                    in_memory_catalog_test.cc)
+
+  add_iceberg_test(parquet_test USE_BUNDLE SOURCES parquet_schema_test.cc)
 endif()
diff --git a/test/parquet_schema_test.cc b/test/parquet_schema_test.cc
new file mode 100644
index 0000000..1a280ff
--- /dev/null
+++ b/test/parquet_schema_test.cc
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <parquet/schema.h>
+#include <parquet/types.h>
+
+#include "iceberg/parquet/parquet_schema_util_internal.h"
+
+namespace iceberg::parquet {
+
+namespace {
+
+::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id 
= -1) {
+  return ::parquet::schema::PrimitiveNode::Make(
+      name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(),
+      ::parquet::Type::INT32, /*primitive_length=*/-1, field_id);
+}
+
+::parquet::schema::NodePtr MakeGroupNode(const std::string& name,
+                                         const ::parquet::schema::NodeVector& 
fields,
+                                         int field_id = -1) {
+  return ::parquet::schema::GroupNode::Make(name, 
::parquet::Repetition::REQUIRED, fields,
+                                            /*logical_type=*/nullptr, 
field_id);
+}
+
+}  // namespace
+
+TEST(HasFieldIds, PrimitiveNode) {
+  EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field")));
+  EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1)));
+}
+
+TEST(HasFieldIds, GroupNode) {
+  auto group_node_without_field_id =
+      MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")});
+  EXPECT_FALSE(HasFieldIds(group_node_without_field_id));
+
+  auto group_node_with_full_field_id = MakeGroupNode(
+      "test_group",
+      {MakeInt32Node("c1", /*field_id=*/2), MakeInt32Node("c2", 
/*field_id=*/3)},
+      /*field_id=*/1);
+  EXPECT_TRUE(HasFieldIds(group_node_with_full_field_id));
+
+  auto group_node_with_partial_field_id = MakeGroupNode(
+      "test_group", {MakeInt32Node("c1", /*field_id=*/1), 
MakeInt32Node("c2")});
+  EXPECT_TRUE(HasFieldIds(group_node_with_partial_field_id));
+}
+
+}  // namespace iceberg::parquet

Reply via email to