fatemehp commented on code in PR #14142:
URL: https://github.com/apache/arrow/pull/14142#discussion_r988440356
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -60,6 +60,11 @@ using arrow::internal::MultiplyWithOverflow;
namespace bit_util = arrow::bit_util;
namespace parquet {
+
+// The minimum number of repetition/definition levels to decode at a time, for
+// better vectorized performance when doing many smaller record reads
+constexpr int64_t kMinLevelBatchSize = 1024;
Review Comment:
Done.
##########
cpp/src/parquet/column_reader.h:
##########
@@ -339,28 +349,49 @@ class RecordReader {
bool read_dictionary() const { return read_dictionary_; }
protected:
+ /// \brief Indicates if we can have nullable values.
bool nullable_values_;
bool at_record_start_;
int64_t records_read_;
+ /// \brief Stores values. These values are populated based on each ReadRecords
+ /// call. No extra values are buffered for the next call. SkipRecords will not
+ /// add any value to this buffer.
+ std::shared_ptr<::arrow::ResizableBuffer> values_;
+ /// \brief False for BYTE_ARRAY, in which case we don't allocate the values
+ /// buffer and we directly read into builder classes.
+ bool uses_values_;
+
+ /// \brief Values that we have read into 'values_' + 'null_count_'.
int64_t values_written_;
int64_t values_capacity_;
int64_t null_count_;
- int64_t levels_written_;
- int64_t levels_position_;
- int64_t levels_capacity_;
-
- std::shared_ptr<::arrow::ResizableBuffer> values_;
- // In the case of false, don't allocate the values buffer (when we directly read into
- // builder classes).
- bool uses_values_;
-
+ /// \brief Each element corresponds to one element in 'values_' and specifies if it
Review Comment:
Done.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]