wgtmac commented on code in PR #46532:
URL: https://github.com/apache/arrow/pull/46532#discussion_r2101790807
##########
cpp/src/parquet/properties.h:
##########
@@ -1101,6 +1116,7 @@ class PARQUET_EXPORT ArrowReaderProperties {
   ::arrow::io::IOContext io_context_;
   ::arrow::io::CacheOptions cache_options_;
   ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+  ::arrow::Type::type binary_type_;

Review Comment:
   Do we need an equivalent config for string type? Or use the same one?



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2243,7 +2267,7 @@ std::shared_ptr<RecordReader> RecordReader::Make(const ColumnDescriptor* descr,
                                                             read_dense_for_nullable);
     case Type::BYTE_ARRAY: {
       return MakeByteArrayRecordReader(descr, leaf_info, pool, read_dictionary,
-                                       read_dense_for_nullable);
+                                       read_dense_for_nullable, arrow_type);

Review Comment:
   ```suggestion
                                       read_dense_for_nullable, std::move(arrow_type));
   ```



##########
cpp/src/parquet/properties.h:
##########
@@ -1032,6 +1035,18 @@ class PARQUET_EXPORT ArrowReaderProperties {
     }
   }
 
+  /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
+  ///
+  /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW.
+  /// Default is Type::BINARY.
+  ///
+  /// If a serialized Arrow schema is found in the Parquet metadata,
+  /// this setting is ignored and the Arrow schema takes precedence
+  /// (see ArrowWriterProperties::store_schema).
+  void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }

Review Comment:
   That may introduce more complexity IMHO. For example, we need to deal with column order, type promotion, missing fields, etc.



##########
cpp/src/parquet/arrow/schema_internal.cc:
##########
@@ -117,34 +118,67 @@ Result<std::shared_ptr<ArrowType>> MakeArrowTimestamp(const LogicalType& logical
 Result<std::shared_ptr<ArrowType>> FromByteArray(
     const LogicalType& logical_type, const ArrowReaderProperties& reader_properties,
     const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata) {
+  auto binary_type = [&]() -> Result<std::shared_ptr<ArrowType>> {
+    const auto configured_binary_type = reader_properties.binary_type();
+    switch (configured_binary_type) {
+      case ::arrow::Type::BINARY:
+        return ::arrow::binary();
+      case ::arrow::Type::LARGE_BINARY:
+        return ::arrow::large_binary();
+      case ::arrow::Type::BINARY_VIEW:
+        return ::arrow::binary_view();
+      default:
+        return Status::TypeError("Invalid Arrow type for BYTE_ARRAY columns: ",
+                                 ::arrow::internal::ToString(configured_binary_type));
+    }
+  };
+
+  auto utf8_type = [&]() -> Result<std::shared_ptr<ArrowType>> {

Review Comment:
   OK, this answers my question in `properties.h`. Now we might need some comments to explain that the property applies to string type as well.
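For context, a minimal usage sketch of the property being reviewed. This is an illustration, not code from the PR: only the `set_binary_type()` setter shown in the diff is assumed, the helper name is made up, and the expectation that UTF8-annotated columns follow the same setting is inferred from the `FromByteArray` change above.

```cpp
// Illustrative sketch only. Assumes the ArrowReaderProperties::set_binary_type()
// setter added in this PR; MakeLargeBinaryReaderProperties() is a hypothetical
// helper, not part of the Arrow API.
#include "arrow/type.h"
#include "parquet/properties.h"

parquet::ArrowReaderProperties MakeLargeBinaryReaderProperties() {
  parquet::ArrowReaderProperties properties;
  // Request 64-bit offsets: BYTE_ARRAY columns should come back as large_binary,
  // and (per the FromByteArray change above) UTF8-annotated ones presumably as
  // large_utf8, unless a serialized Arrow schema in the file metadata takes
  // precedence (see ArrowWriterProperties::store_schema).
  properties.set_binary_type(::arrow::Type::LARGE_BINARY);
  return properties;
}
```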
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2202,26 +2227,25 @@ void TypedRecordReader<ByteArrayType>::DebugPrintState() {}
 template <>
 void TypedRecordReader<FLBAType>::DebugPrintState() {}
 
-std::shared_ptr<RecordReader> MakeByteArrayRecordReader(const ColumnDescriptor* descr,
-                                                        LevelInfo leaf_info,
-                                                        ::arrow::MemoryPool* pool,
-                                                        bool read_dictionary,
-                                                        bool read_dense_for_nullable) {
+std::shared_ptr<RecordReader> MakeByteArrayRecordReader(
+    const ColumnDescriptor* descr, LevelInfo leaf_info, ::arrow::MemoryPool* pool,
+    bool read_dictionary, bool read_dense_for_nullable,
+    std::shared_ptr<::arrow::DataType> arrow_type) {
   if (read_dictionary) {
     return std::make_shared<ByteArrayDictionaryRecordReader>(descr, leaf_info, pool,
                                                              read_dense_for_nullable);
   } else {
-    return std::make_shared<ByteArrayChunkedRecordReader>(descr, leaf_info, pool,
-                                                          read_dense_for_nullable);
+    return std::make_shared<ByteArrayChunkedRecordReader>(
+        descr, leaf_info, pool, read_dense_for_nullable, arrow_type);

Review Comment:
   ```suggestion
        descr, leaf_info, pool, read_dense_for_nullable, std::move(arrow_type));
   ```



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -2069,11 +2070,35 @@ class ByteArrayChunkedRecordReader final : public TypedRecordReader<ByteArrayTyp
                                            virtual public BinaryRecordReader {
  public:
   ByteArrayChunkedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
-                               ::arrow::MemoryPool* pool, bool read_dense_for_nullable)
+                               ::arrow::MemoryPool* pool, bool read_dense_for_nullable,
+                               std::shared_ptr<::arrow::DataType> arrow_type)
       : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool,
                                          read_dense_for_nullable) {
     ARROW_DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
-    accumulator_.builder = std::make_unique<::arrow::BinaryBuilder>(pool);
+    auto arrow_binary_type = arrow_type ? arrow_type->id() : ::arrow::Type::BINARY;

Review Comment:
   Any chance that `arrow_type` will actually be null?
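As an aside on the nullability question above, a condensed sketch of the fallback pattern the constructor relies on. The free function is hypothetical; the PR's actual code selects the builder inline and assigns it to `accumulator_.builder`.

```cpp
// Sketch of the null-tolerant fallback under discussion: with no Arrow type
// passed down, keep the pre-existing behaviour (plain BinaryBuilder); otherwise
// pick the builder matching the requested binary representation.
#include <memory>

#include "arrow/builder.h"
#include "arrow/memory_pool.h"
#include "arrow/type.h"

// Hypothetical helper for illustration, not part of the PR.
std::unique_ptr<::arrow::ArrayBuilder> MakeByteArrayAccumulator(
    const std::shared_ptr<::arrow::DataType>& arrow_type, ::arrow::MemoryPool* pool) {
  const auto type_id = arrow_type ? arrow_type->id() : ::arrow::Type::BINARY;
  switch (type_id) {
    case ::arrow::Type::LARGE_BINARY:
      return std::make_unique<::arrow::LargeBinaryBuilder>(pool);
    case ::arrow::Type::BINARY_VIEW:
      return std::make_unique<::arrow::BinaryViewBuilder>(pool);
    default:  // BINARY, or a null arrow_type, falls back to the old behaviour.
      return std::make_unique<::arrow::BinaryBuilder>(pool);
  }
}
```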