pitrou commented on code in PR #45866:
URL: https://github.com/apache/arrow/pull/45866#discussion_r2016917607
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -134,16 +135,22 @@ Result<std::shared_ptr<ArrowType>> FromByteArray(
}
}
-Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
- int32_t physical_length) {
+Result<std::shared_ptr<ArrowType>> FromFLBA(
+ const LogicalType& logical_type, int32_t physical_length,
+ const ArrowReaderProperties& reader_properties) {
switch (logical_type.type()) {
case LogicalType::Type::DECIMAL:
return MakeArrowDecimal(logical_type);
case LogicalType::Type::FLOAT16:
return ::arrow::float16();
case LogicalType::Type::NONE:
case LogicalType::Type::INTERVAL:
+ return ::arrow::fixed_size_binary(physical_length);
case LogicalType::Type::UUID:
+ if (reader_properties.get_arrow_extensions_enabled()) {
+ return ::arrow::extension::uuid();
Review Comment:
Do we need to check `physical_length` here?
##########
cpp/src/parquet/arrow/arrow_schema_test.cc:
##########
@@ -948,6 +949,68 @@ TEST_F(TestConvertParquetSchema,
ParquetSchemaArrowExtensions) {
}
}
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+ std::vector<NodePtr> parquet_fields;
+ parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+ LogicalType::UUID(),
+
ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+ {
+ // Parquet file does not contain Arrow schema.
+ // By default, field should be treated as fixed_size_binary(16) in Arrow.
+ auto arrow_schema =
+ ::arrow::schema({::arrow::field("uuid",
::arrow::fixed_size_binary(16), true)});
+ std::shared_ptr<KeyValueMetadata> metadata{};
+ ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+ CheckFlatSchema(arrow_schema);
+ }
+
+ {
+ // Parquet file does not contain Arrow schema.
+ // If Arrow extensions are enabled, field will be interpreted as uuid()
+ // extension field.
+ ArrowReaderProperties props;
+ props.set_arrow_extensions_enabled(true);
+ auto arrow_schema =
+ ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid(),
true)});
+ std::shared_ptr<KeyValueMetadata> metadata{};
+ ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+ CheckFlatSchema(arrow_schema);
+ }
+
+ {
+ // Parquet file contains Arrow schema.
+ // uuid will be interpreted as uuid() field
+ ArrowReaderProperties props;
+ props.set_arrow_extensions_enabled(false);
+ std::shared_ptr<KeyValueMetadata> field_metadata =
+ ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+ auto arrow_schema = ::arrow::schema(
+ {::arrow::field("uuid", ::arrow::extension::uuid(), true,
field_metadata)});
+
+ std::shared_ptr<KeyValueMetadata> metadata;
+ ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
+ ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+ CheckFlatSchema(arrow_schema, true /* check_metadata */);
+ }
+
+ {
+ // Parquet file contains Arrow schema.
+ // uuid will be interpreted as uuid() field even though extensions are not
enabled.
Review Comment:
They are below.
##########
python/pyarrow/_dataset_parquet.pyx:
##########
@@ -720,6 +720,9 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
Parquet file.
page_checksum_verification : bool, default False
If True, verify the page checksum for each page read from the file.
+ arrow_extensions_enabled : bool, default False
+ If True, read Parquet logical types as Arrow Extension Types where
possible,
Review Comment:
Please see comments I posted on this on the Geometry PR.
##########
python/pyarrow/tests/parquet/test_data_types.py:
##########
@@ -520,4 +520,40 @@ def test_json_extension_type(storage_type):
table = pa.table([arr], names=["ext"])
- _simple_table_roundtrip(table)
+ # With defaults, this should roundtrip (because store_schema=True)
+ _check_roundtrip(table, table)
+
+ # When store_schema is False, we get a string back by default
+ _check_roundtrip(
+ table,
+ pa.table({"ext": pa.array(data, pa.string())}),
+ store_schema=False)
+
+ # With arrow_extensions_enabled=True on read, we get a arrow.uuid back
Review Comment:
This the JSON test, not UUID.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]