wgtmac commented on code in PR #14353:
URL: https://github.com/apache/arrow/pull/14353#discussion_r1085225401


##########
cpp/src/parquet/encoding.cc:
##########
@@ -1412,6 +1423,57 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
     return Status::OK();
   }
 
+  Status DecodeArrowDenseZeroCopy(int num_values, int null_count, const uint8_t* valid_bits,
+                              int32_t* offset,
+                              std::shared_ptr<::arrow::ResizableBuffer>& values,
+                              int64_t valid_bits_offset, int* out_values_decoded,
+                              int32_t* bianry_length) {

Review Comment:
   ```suggestion
                                 int32_t* binary_length) {
   ```



##########
cpp/src/parquet/encoding.cc:
##########
@@ -1977,6 +2128,48 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
     return Status::OK();
   }
 
+  Status DecodeArrowDenseNonNull_opt(int num_values, int32_t* offset,

Review Comment:
   Can we avoid this kind of name and give a meaningful one instead?



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1959,6 +1972,138 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
   typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
 };
 
+class ByteArrayChunkedOptRecordReader : public TypedRecordReader<ByteArrayType>,
+                                        virtual public BinaryRecordReader {
+ public:
+  ByteArrayChunkedOptRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+                                  ::arrow::MemoryPool* pool)
+      : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
+    DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
+    accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
+    values_ = AllocateBuffer(pool);
+    offset_ = AllocateBuffer(pool);
+  }
+
+  ::arrow::ArrayVector GetBuilderChunks() override {
+    if (uses_opt_) {
+      std::vector<std::shared_ptr<Buffer>> buffers = {ReleaseIsValid(), ReleaseOffsets(),
+                                                      ReleaseValues()};
+      auto data = std::make_shared<::arrow::ArrayData>(
+          ::arrow::binary(), values_written(), buffers, null_count());

Review Comment:
   Is `::arrow::binary()` correct? Will it cause a type mismatch if the actual type is `::arrow::utf8()`? For example, when checking equality with a StringArray, the comparison may fail on type identity if one side is binary and the other is utf8.
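   A minimal sketch (not part of the patch) of why the distinction matters: the same bytes exposed as `binary()` and as `utf8()` are different Arrow types, so `Equals()` can fail on type identity alone.
   ```cpp
   #include <arrow/api.h>

   // Hedged illustration: identical data, different logical types.
   ::arrow::Status TypeMismatchExample() {
     ::arrow::BinaryBuilder bin_builder;
     ::arrow::StringBuilder str_builder;
     ARROW_RETURN_NOT_OK(bin_builder.Append("abc"));
     ARROW_RETURN_NOT_OK(str_builder.Append("abc"));
     std::shared_ptr<::arrow::Array> bin_array, str_array;
     ARROW_RETURN_NOT_OK(bin_builder.Finish(&bin_array));
     ARROW_RETURN_NOT_OK(str_builder.Finish(&str_array));
     // false: BinaryArray vs. StringArray, even though the bytes match.
     bool equal = bin_array->Equals(*str_array);
     return equal ? ::arrow::Status::Invalid("unexpected") : ::arrow::Status::OK();
   }
   ```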



##########
cpp/src/parquet/encoding.cc:
##########
@@ -1977,6 +2128,48 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
     return Status::OK();
   }
 
+  Status DecodeArrowDenseNonNull_opt(int num_values, int32_t* offset,
+                                     std::shared_ptr<::arrow::ResizableBuffer>& values,
+                                     int* out_num_values, int32_t* bianry_length) {
+    constexpr int32_t kBufferSize = 2048;
+    int32_t indices[kBufferSize];
+    int values_decoded = 0;
+    uint64_t capacity = values->size();
+
+    // ArrowBinaryHelper helper(out);
+    auto dict_values = reinterpret_cast<const ByteArray*>(dictionary_->data());
+
+    auto dst_value = values->mutable_data() + (*bianry_length);
+    int num_appended = 0;
+
+    while (values_decoded < num_values) {
+      int32_t batch_size = std::min<int32_t>(kBufferSize, num_values - values_decoded);
+      int num_indices = idx_decoder_.GetBatch(indices, batch_size);
+      if (num_indices == 0) ParquetException::EofException();

Review Comment:
   It would be helpful to provide a concrete error message.
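   A sketch of what that could look like at this spot in the loop, assuming `ParquetException::EofException` accepts a message string (please adjust to the actual signature):
   ```cpp
   // Hedged sketch: report how far decoding got before the data ran out.
   if (num_indices == 0) {
     ParquetException::EofException(
         "Dictionary index decoder ran out of data after decoding " +
         std::to_string(values_decoded) + " of " + std::to_string(num_values) +
         " values");
   }
   ```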



##########
cpp/src/parquet/column_reader.h:
##########
@@ -291,6 +293,8 @@ class PARQUET_EXPORT RecordReader {
   /// \brief Pre-allocate space for data. Results in better flat read performance
   virtual void Reserve(int64_t num_values) = 0;
 
+  virtual void ReserveValues(int64_t capacity) {}

Review Comment:
   Would it be better to make it pure virtual? In addition, it would help to add a comment for this public function.
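   A sketch of the suggested shape (the comment wording is only a guess at the intent):
   ```cpp
   /// \brief Pre-allocate space for binary/byte-array value data. Results in
   /// better read performance for BYTE_ARRAY columns. (assumed wording)
   virtual void ReserveValues(int64_t capacity) = 0;
   ```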



##########
cpp/src/parquet/column_reader.h:
##########
@@ -55,6 +55,8 @@ static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
 // 16 KB is the default expected page header size
 static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
 
+static constexpr int32_t kDefaultBinaryPerRowSize = 20;

Review Comment:
   It would be better to add a comment here.
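   For example (the rationale is a guess and should come from the author):
   ```cpp
   // Assumed meaning: estimated average number of bytes per BYTE_ARRAY value,
   // used to pre-size the value buffer before decoding.
   static constexpr int32_t kDefaultBinaryPerRowSize = 20;
   ```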



##########
cpp/src/parquet/column_reader.h:
##########
@@ -299,6 +303,8 @@ class PARQUET_EXPORT RecordReader {
   /// allocated in subsequent ReadRecords calls
   virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
 
+  virtual std::shared_ptr<ResizableBuffer> ReleaseOffsets() = 0;
+

Review Comment:
   Same here for a comment.
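   For example (assumed wording, mirroring the `ReleaseValues` comment above):
   ```cpp
   /// \brief Transfer outstanding offsets buffer (for BYTE_ARRAY columns) to
   /// the caller; a new one will be allocated in subsequent ReadRecords calls
   virtual std::shared_ptr<ResizableBuffer> ReleaseOffsets() = 0;
   ```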



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -850,12 +850,23 @@ class ColumnReaderImplBase {
     current_encoding_ = encoding;
     current_decoder_->SetData(static_cast<int>(num_buffered_values_), buffer,
                               static_cast<int>(data_size));
+    if (!hasSet_uses_opt_) {
+      if (current_encoding_ == Encoding::PLAIN_DICTIONARY ||
+          current_encoding_ == Encoding::PLAIN ||
+          current_encoding_ == Encoding::RLE_DICTIONARY) {

Review Comment:
   Just curious: why are other encodings not supported?



##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1957,6 +1970,139 @@ class ByteArrayChunkedRecordReader : public TypedRecordReader<ByteArrayType>,
   typename EncodingTraits<ByteArrayType>::Accumulator accumulator_;
 };
 
+class ByteArrayChunkedOptRecordReader : public TypedRecordReader<ByteArrayType>,
+                                        virtual public BinaryRecordReader {
+ public:
+  ByteArrayChunkedOptRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info,
+                                  ::arrow::MemoryPool* pool)
+      : TypedRecordReader<ByteArrayType>(descr, leaf_info, pool) {
+    DCHECK_EQ(descr_->physical_type(), Type::BYTE_ARRAY);
+    accumulator_.builder.reset(new ::arrow::BinaryBuilder(pool));
+    values_ = AllocateBuffer(pool);
+    offset_ = AllocateBuffer(pool);
+  }
+
+  ::arrow::ArrayVector GetBuilderChunks() override {
+    if (uses_opt_) {
+      std::vector<std::shared_ptr<Buffer>> buffers = {ReleaseIsValid(), ReleaseOffsets(),
+                                                      ReleaseValues()};
+      auto data = std::make_shared<::arrow::ArrayData>(
+          ::arrow::binary(), values_written(), buffers, null_count());
+
+      auto chunks = ::arrow::ArrayVector({::arrow::MakeArray(data)});
+      return chunks;
+    } else {
+      ::arrow::ArrayVector result = accumulator_.chunks;
+      if (result.size() == 0 || accumulator_.builder->length() > 0) {
+        std::shared_ptr<::arrow::Array> last_chunk;
+        PARQUET_THROW_NOT_OK(accumulator_.builder->Finish(&last_chunk));
+        result.push_back(std::move(last_chunk));
+      }
+      accumulator_.chunks = {};
+      return result;

Review Comment:
   +1 for not adding a separate class. This would be difficult to maintain if more optimizations are added. It would also be better to add an option so that users can manually turn the new path off if something goes wrong with it.
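   Purely illustrative (the commented-out setter does not exist): such an opt-out could follow the existing `ArrowReaderProperties` pattern.
   ```cpp
   #include "parquet/properties.h"

   // Hypothetical opt-out, following how other reader options are exposed.
   parquet::ArrowReaderProperties MakeReaderProperties() {
     parquet::ArrowReaderProperties props;
     props.set_use_threads(true);           // existing option, shown for the pattern
     // props.set_binary_zero_copy(false);  // hypothetical toggle for the new path
     return props;
   }
   ```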



##########
cpp/src/parquet/encoding.h:
##########
@@ -317,6 +317,13 @@ class TypedDecoder : virtual public Decoder {
                           int64_t valid_bits_offset,
                          typename EncodingTraits<DType>::Accumulator* out) = 0;
 
+  virtual int DecodeArrow_opt(int num_values, int null_count, const uint8_t* valid_bits,
+                              int32_t* offset,
+                              std::shared_ptr<::arrow::ResizableBuffer>& values,
+                              int64_t valid_bits_offset, int32_t* bianry_length) {
+    return 0;

Review Comment:
   When I was at my previous employer, we implemented a mutable arrow::Array to address a similar issue in the ORC reader. The idea is that the total length of all string/binary values in a single batch can be known in advance, so the data buffer can be pre-allocated at once, or a previous buffer can even be reused if it has enough capacity. The overhead of buffer allocation and resize operations is non-negligible.
   
   @pitrou We have discussed the idea in https://issues.apache.org/jira/browse/ARROW-15289
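   A minimal sketch of the pre-allocation idea, not code from this PR (the helper name and signature are made up for illustration):
   ```cpp
   #include <arrow/api.h>

   // If the total byte length of a batch's values is known up front, size the
   // data buffer once and reuse its capacity across batches.
   ::arrow::Status PrepareValueBuffer(int64_t total_binary_length,
                                      ::arrow::MemoryPool* pool,
                                      std::shared_ptr<::arrow::ResizableBuffer>* values) {
     if (*values == nullptr) {
       ARROW_ASSIGN_OR_RAISE(
           *values, ::arrow::AllocateResizableBuffer(total_binary_length, pool));
     } else if ((*values)->size() < total_binary_length) {
       // Grow without shrinking so later batches can reuse the capacity.
       ARROW_RETURN_NOT_OK((*values)->Resize(total_binary_length, /*shrink_to_fit=*/false));
     }
     return ::arrow::Status::OK();
   }
   ```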



##########
cpp/src/parquet/encoding.cc:
##########
@@ -1412,6 +1423,57 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
     return Status::OK();
   }
 
+  Status DecodeArrowDenseZeroCopy(int num_values, int null_count, const uint8_t* valid_bits,
+                              int32_t* offset,
+                              std::shared_ptr<::arrow::ResizableBuffer>& values,
+                              int64_t valid_bits_offset, int* out_values_decoded,
+                              int32_t* bianry_length) {

Review Comment:
   There are similar typos below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
