pitrou commented on code in PR #47242: URL: https://github.com/apache/arrow/pull/47242#discussion_r2297533444
########## cpp/src/parquet/arrow/arrow_schema_test.cc: ########## @@ -941,46 +942,61 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) { PrimitiveNode::Make("metadata", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); auto value = PrimitiveNode::Make("value", Repetition::REQUIRED, ParquetType::BYTE_ARRAY); - - auto variant = - GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, {metadata, value}); + auto variant = GroupNode::Make("variant_unshredded", Repetition::OPTIONAL, + {metadata, value}, LogicalType::Variant()); parquet_fields.push_back(variant); - { - // Test converting from parquet schema to arrow schema. - std::vector<std::shared_ptr<Field>> arrow_fields; - auto arrow_metadata = - ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); - auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); - auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); - arrow_fields.push_back( - ::arrow::field("variant_unshredded", arrow_variant, /*nullable=*/true)); - auto arrow_schema = ::arrow::schema(arrow_fields); + // Arrow schema for unshredded variant struct. + auto arrow_metadata = ::arrow::field("metadata", ::arrow::binary(), /*nullable=*/false); + auto arrow_value = ::arrow::field("value", ::arrow::binary(), /*nullable=*/false); + auto arrow_variant = ::arrow::struct_({arrow_metadata, arrow_value}); + auto variant_extension = std::make_shared<VariantExtensionType>(arrow_variant); + { + // Parquet file does not contain Arrow schema. + // By default, field should be treated as a normal struct in Arrow. + auto arrow_schema = + ::arrow::schema({::arrow::field("variant_unshredded", arrow_variant)}); ASSERT_OK(ConvertSchema(parquet_fields)); - ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); } - { - // Test converting from parquet schema to arrow schema even though + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + + // Parquet file does not contain Arrow schema. + // If Arrow extensions are enabled, field should be interpreted as Parquet Variant + // extension type if registered. + ArrowReaderProperties props; + props.set_arrow_extensions_enabled(true); + + auto arrow_schema = ::arrow::schema({::arrow::field( + "variant_unshredded", register_extension ? variant_extension : arrow_variant)}); + + ASSERT_OK(ConvertSchema(parquet_fields, /*metadata=*/nullptr, props)); + ASSERT_NO_FATAL_FAILURE(CheckFlatSchema(arrow_schema, /*check_metadata=*/true)); + } + + for (bool register_extension : {true, false}) { + ::arrow::ExtensionTypeGuard guard(register_extension + ? ::arrow::DataTypeVector{variant_extension} + : ::arrow::DataTypeVector{}); + + // Parquet file does contain Arrow schema. + // Field should be interpreted as Parquet Variant extension even though // extensions are not enabled. Review Comment: ```suggestion // Field should be interpreted as Parquet Variant extension, if registered, // even though extensions are not enabled. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org