lidavidm commented on code in PR #159: URL: https://github.com/apache/iceberg-cpp/pull/159#discussion_r2253468625
########## test/parquet_schema_test.cc: ########## @@ -17,50 +17,494 @@ * under the License. */ +#include <arrow/type.h> #include <gtest/gtest.h> +#include <parquet/arrow/reader.h> +#include <parquet/arrow/schema.h> #include <parquet/schema.h> -#include <parquet/types.h> +#include "iceberg/metadata_columns.h" #include "iceberg/parquet/parquet_schema_util_internal.h" +#include "iceberg/schema.h" +#include "matchers.h" namespace iceberg::parquet { namespace { -::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1) { +constexpr std::string_view kParquetFieldIdKey = "PARQUET:field_id"; + +::parquet::schema::NodePtr MakeInt32Node(const std::string& name, int field_id = -1, + bool optional = true) { + return ::parquet::schema::PrimitiveNode::Make( + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::None(), ::parquet::Type::INT32, /*primitive_length=*/-1, + field_id); +} + +::parquet::schema::NodePtr MakeInt64Node(const std::string& name, int field_id = -1, + bool optional = true) { + return ::parquet::schema::PrimitiveNode::Make( + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::None(), ::parquet::Type::INT64, /*primitive_length=*/-1, + field_id); +} + +::parquet::schema::NodePtr MakeStringNode(const std::string& name, int field_id = -1, + bool optional = true) { + return ::parquet::schema::PrimitiveNode::Make( + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::String(), ::parquet::Type::BYTE_ARRAY, + /*primitive_length=*/-1, field_id); +} + +::parquet::schema::NodePtr MakeDoubleNode(const std::string& name, int field_id = -1, + bool optional = true) { + return ::parquet::schema::PrimitiveNode::Make( + name, optional ? 
::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::None(), ::parquet::Type::DOUBLE, /*primitive_length=*/-1, + field_id); +} + +::parquet::schema::NodePtr MakeFloatNode(const std::string& name, int field_id = -1, + bool optional = true) { return ::parquet::schema::PrimitiveNode::Make( - name, ::parquet::Repetition::REQUIRED, ::parquet::LogicalType::None(), - ::parquet::Type::INT32, /*primitive_length=*/-1, field_id); + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + ::parquet::LogicalType::None(), ::parquet::Type::FLOAT, /*primitive_length=*/-1, + field_id); } ::parquet::schema::NodePtr MakeGroupNode(const std::string& name, const ::parquet::schema::NodeVector& fields, - int field_id = -1) { - return ::parquet::schema::GroupNode::Make(name, ::parquet::Repetition::REQUIRED, fields, - /*logical_type=*/nullptr, field_id); + int field_id = -1, bool optional = true) { + return ::parquet::schema::GroupNode::Make( + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + fields, /*logical_type=*/nullptr, field_id); } +::parquet::schema::NodePtr MakeListNode(const std::string& name, + const ::parquet::schema::NodePtr& element_node, + int field_id = -1, bool optional = true) { + auto list_group = ::parquet::schema::GroupNode::Make( + "element", ::parquet::Repetition::REPEATED, {element_node}); + return ::parquet::schema::GroupNode::Make( + name, optional ? 
::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + {list_group}, ::parquet::LogicalType::List(), field_id); +} + +::parquet::schema::NodePtr MakeMapNode(const std::string& name, + const ::parquet::schema::NodePtr& key_node, + const ::parquet::schema::NodePtr& value_node, + int field_id = -1, bool optional = true) { + auto key_value_group = ::parquet::schema::GroupNode::Make( + "key_value", ::parquet::Repetition::REPEATED, {key_node, value_node}); + return ::parquet::schema::GroupNode::Make( + name, optional ? ::parquet::Repetition::OPTIONAL : ::parquet::Repetition::REQUIRED, + {key_value_group}, ::parquet::LogicalType::Map(), field_id); +} + +// Helper to create SchemaManifest from Parquet schema +::parquet::arrow::SchemaManifest MakeSchemaManifest( + const ::parquet::schema::NodePtr& parquet_schema) { + auto parquet_schema_descriptor = std::make_shared<::parquet::SchemaDescriptor>(); + parquet_schema_descriptor->Init(parquet_schema); + + ::parquet::arrow::SchemaManifest manifest; + auto status = ::parquet::arrow::SchemaManifest::Make( + parquet_schema_descriptor.get(), /*key_value_metadata=*/nullptr, + ::parquet::default_arrow_reader_properties(), &manifest); + if (!status.ok()) { + throw std::runtime_error("Failed to create SchemaManifest: " + status.ToString()); + } + return manifest; +} + +#define ASSERT_PROJECTED_FIELD(field_projection, index) \ + ASSERT_EQ(field_projection.kind, FieldProjection::Kind::kProjected); \ + ASSERT_EQ(std::get<1>(field_projection.from), index); + +#define ASSERT_PROJECTED_NULL_FIELD(field_projection) \ + ASSERT_EQ(field_projection.kind, FieldProjection::Kind::kNull); + } // namespace TEST(HasFieldIds, PrimitiveNode) { EXPECT_FALSE(HasFieldIds(MakeInt32Node("test_field"))); EXPECT_TRUE(HasFieldIds(MakeInt32Node("test_field", /*field_id=*/1))); } -TEST(HasFieldIds, GroupNode) { - auto group_node_without_field_id = - MakeGroupNode("test_group", {MakeInt32Node("c1"), MakeInt32Node("c2")}); - 
EXPECT_FALSE(HasFieldIds(group_node_without_field_id)); +// TEST(HasFieldIds, GroupNode) { Review Comment: Weird, I'm not sure what's going on here... NodePtr is a shared_ptr, and this seems to work elsewhere. https://github.com/llvm/llvm-project/issues/122405 also seems to be related - maybe this check is a bit flaky. Can we uncomment the test and use NOLINT on the specific line instead? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org