wgtmac commented on code in PR #46532:
URL: https://github.com/apache/arrow/pull/46532#discussion_r2101790807
##########
cpp/src/parquet/properties.h:
##########
@@ -1101,6 +1116,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
   ::arrow::io::IOContext io_context_;
   ::arrow::io::CacheOptions cache_options_;
   ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+  ::arrow::Type::type binary_type_;

Review Comment:
   Do we need an equivalent config for string type? Or use the same one?



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2243,7 +2267,7 @@ std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
                                                             read_dense_for_nullable);
     case Type::BYTE_ARRAY: {
       return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
-                                       read_dense_for_nullable);
+                                       read_dense_for_nullable, arrow_type);

Review Comment:
   ```suggestion
                                       read_dense_for_nullable, std::move(arrow_type));
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -1032,6 +1035,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
     }
   }
 
+  /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
+  ///
+  /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW.
+  /// Default is Type::BINARY.
+  ///
+  /// If a serialized Arrow schema is found in the Parquet metadata,
+  /// this setting is ignored and the Arrow schema takes precedence
+  /// (see ArrowWriterProperties::store_schema).
+  void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }

Review Comment:
   That may introduce more complexity IMHO. For example, we need to deal with column order, type promotion, missing fields, etc.



##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -117,34 +118,67 @@ Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical
 Result<std::shared_ptr<ArrowType>> FromByteArray(
     const LogicalType& logical_type, const ArrowReaderProperties& reader_properties,
     const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+  auto binary_type = [&]() -> Result<std::shared_ptr<ArrowType>> {
+    const auto configured_binary_type = reader_properties.binary_type();
+    switch (configured_binary_type) {
+      case ::arrow::Type::BINARY:
+        return ::arrow::binary();
+      case ::arrow::Type::LARGE_BINARY:
+        return ::arrow::large_binary();
+      case ::arrow::Type::BINARY_VIEW:
+        return ::arrow::binary_view();
+      default:
+        return Status::TypeError("Invalid Arrow type for BYTE_ARRAY columns: ",
+                                 ::arrow::internal::ToString(configured_binary_type));
+    }
+  };
+
+  auto utf8_type = [&]() -> Result<std::shared_ptr<ArrowType>> {

Review Comment:
   OK, this answers my question in `properties.h`. Now we might need some comments to explain that the property applies to string type as well.
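For context, a minimal usage sketch of the property being reviewed. This is an illustration, not code from the PR: only the `set_binary_type()` setter shown in the diff is assumed, the helper name is made up, and the expectation that UTF8-annotated columns follow the same setting is inferred from the `FromByteArray` change above.

```cpp
// Illustrative sketch only. Assumes the ArrowReaderProperties::set_binary_type()
// setter added in this PR; MakeLargeBinaryReaderProperties() is a hypothetical
// helper, not part of the Arrow API.
#include "arrow/type.h"
#include "parquet/properties.h"

parquet::ArrowReaderProperties MakeLargeBinaryReaderProperties() {
  parquet::ArrowReaderProperties properties;
  // Request 64-bit offsets: BYTE_ARRAY columns should come back as large_binary,
  // and (per the FromByteArray change above) UTF8-annotated ones presumably as
  // large_utf8, unless a serialized Arrow schema in the file metadata takes
  // precedence (see ArrowWriterProperties::store_schema).
  properties.set_binary_type(::arrow::Type::LARGE_BINARY);
  return properties;
}
```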
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2202,26 +2227,25 @@ void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
 template <>
 void TypedRecordReader<FLBAType>::DebugPrintState() {}
 
-std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
-                                                        LevelInfo leaf_info,
-                                                        ::arrow::MemoryPool* pool,
-                                                        bool read_dictionary,
-                                                        bool read_dense_for_nullable) {
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(
+    const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool,
+    bool read_dictionary, bool read_dense_for_nullable,
+    std::shared_ptr<::arrow::DataType> arrow_type) {
   if (read_dictionary) {
     return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool,
                                                              read_dense_for_nullable);
   } else {
-    return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool,
-                                                          read_dense_for_nullable);
+    return std::make_shared<ByteArrayChunkedRecordReader>(
+        descr, leaf_info, pool, read_dense_for_nullable, arrow_type);

Review Comment:
   ```suggestion
        descr, leaf_info, pool, read_dense_for_nullable, std::move(arrow_type));
   ```



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2069,11 +2070,35 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayTyp
                                            virtual public BinaryRecordReader {
  public:
   ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
-                               ::arrow::MemoryPool* pool, bool read_dense_for_nullable)
+                               ::arrow::MemoryPool* pool, bool read_dense_for_nullable,
+                               std::shared_ptr<::arrow::DataType> arrow_type)
       : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool,
                                          read_dense_for_nullable) {
     ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
-    accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool);
+    auto arrow_binary_type = arrow_type ? arrow_type->id() : ::arrow::Type::BINARY;

Review Comment:
   Any chance that `arrow_type` will actually be null?
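As an aside on the nullability question above, a condensed sketch of the fallback pattern the constructor relies on. The free function is hypothetical; the PR's actual code selects the builder inline and assigns it to `accumulator_.builder`.

```cpp
// Sketch of the null-tolerant fallback under discussion: with no Arrow type
// passed down, keep the pre-existing behaviour (plain BinaryBuilder); otherwise
// pick the builder matching the requested binary representation.
#include <memory>

#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/type.h"

// Hypothetical helper for illustration, not part of the PR.
std::unique_ptr<::arrow::ArrayBuilder> MakeByteArrayAccumulator(
    const std::shared_ptr<::arrow::DataType>& arrow_type, ::arrow::MemoryPool* pool) {
  const auto type_id = arrow_type ? arrow_type->id() : ::arrow::Type::BINARY;
  switch (type_id) {
    case ::arrow::Type::LARGE_BINARY:
      return std::make_unique<::arrow::LargeBinaryBuilder>(pool);
    case ::arrow::Type::BINARY_VIEW:
      return std::make_unique<::arrow::BinaryViewBuilder>(pool);
    default:  // BINARY, or a null arrow_type, falls back to the old behaviour.
      return std::make_unique<::arrow::BinaryBuilder>(pool);
  }
}
```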