wgtmac commented on code in PR #46532:
URL: https://github.com/apache/arrow/pull/46532#discussion_r2101790807
##########
cpp/src/parquet/properties.h:
##########
@@ -1101,6 +1116,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
::arrow::io::IOContext io_context_;
::arrow::io::CacheOptions cache_options_;
::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+ ::arrow::Type::type binary_type_;
Review Comment:
Do we need an equivalent config for string type? Or use the same one?
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2243,7 +2267,7 @@ std::shared_ptr<RecordReader> RecordReader::Make(const
ColumnDescriptor* descr,
read_dense_for_nullable);
case Type::BYTE_ARRAY: {
return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
- read_dense_for_nullable);
+ read_dense_for_nullable, arrow_type);
Review Comment:
```suggestion
read_dense_for_nullable,
std::move(arrow_type));
```
##########
cpp/src/parquet/properties.h:
##########
@@ -1032,6 +1035,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
}
}
+ /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
+ ///
+ /// Allowed values are Type::BINARY, Type::LARGE_BINARY and
Type::BINARY_VIEW.
+ /// Default is Type::BINARY.
+ ///
+ /// If a serialized Arrow schema is found in the Parquet metadata,
+ /// this setting is ignored and the Arrow schema takes precedence
+ /// (see ArrowWriterProperties::store_schema).
+ void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }
Review Comment:
That may introduce more complexity IMHO. For example, we need to deal with
column order, type promotion, missing fields, etc.
##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -117,34 +118,67 @@ Result<std::shared_ptr<ArrowType>>
MakeArrowTimestamp(const LogicalType& logical
Result<std::shared_ptr<ArrowType>> FromByteArray(
const LogicalType& logical_type, const ArrowReaderProperties&
reader_properties,
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+ auto binary_type = [&]() -> Result<std::shared_ptr<ArrowType>> {
+ const auto configured_binary_type = reader_properties.binary_type();
+ switch (configured_binary_type) {
+ case ::arrow::Type::BINARY:
+ return ::arrow::binary();
+ case ::arrow::Type::LARGE_BINARY:
+ return ::arrow::large_binary();
+ case ::arrow::Type::BINARY_VIEW:
+ return ::arrow::binary_view();
+ default:
+ return Status::TypeError("Invalid Arrow type for BYTE_ARRAY columns: ",
+
::arrow::internal::ToString(configured_binary_type));
+ }
+ };
+
+ auto utf8_type = [&]() -> Result<std::shared_ptr<ArrowType>> {
Review Comment:
OK, this answers my question in `properties.h`. Now we might need some
comments to explain that the property applies to string type as well.
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2202,26 +2227,25 @@ void
TypedRecordReader<ByteArrayType>::DebugPrintState() {}
template <>
void TypedRecordReader<FLBAType>::DebugPrintState() {}
-std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const
ColumnDescriptor* descr,
- LevelInfo leaf_info,
- ::arrow::MemoryPool*
pool,
- bool read_dictionary,
- bool
read_dense_for_nullable) {
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(
+ const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool*
pool,
+ bool read_dictionary, bool read_dense_for_nullable,
+ std::shared_ptr<::arrow::DataType> arrow_type) {
if (read_dictionary) {
return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info,
pool,
read_dense_for_nullable);
} else {
- return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info,
pool,
-
read_dense_for_nullable);
+ return std::make_shared<ByteArrayChunkedRecordReader>(
+ descr, leaf_info, pool, read_dense_for_nullable, arrow_type);
Review Comment:
```suggestion
descr, leaf_info, pool, read_dense_for_nullable,
std::move(arrow_type));
```
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2069,11 +2070,35 @@ class ByteArrayChunkedRecordReader final : public
TypedRecordReader<ByteArrayTyp
virtual public BinaryRecordReader {
public:
ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo
leaf_info,
- ::arrow::MemoryPool* pool, bool
read_dense_for_nullable)
+ ::arrow::MemoryPool* pool, bool
read_dense_for_nullable,
+ std::shared_ptr<::arrow::DataType> arrow_type)
: TypedRecordReader<ByteArrayType>(descr, leaf_info, pool,
read_dense_for_nullable) {
ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
- accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool);
+ auto arrow_binary_type = arrow_type ? arrow_type->id() :
::arrow::Type::BINARY;
Review Comment:
Any chance that `arrow_type` will actually be null?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]