emkornfield commented on code in PR #17877:
URL: https://github.com/apache/arrow/pull/17877#discussion_r1088477349
##########
cpp/src/parquet/column_reader.cc:
##########
@@ -1803,7 +1805,100 @@ class TypedRecordReader : public
TypedColumnReaderImpl<DType>,
CheckNumberDecoded(num_decoded, values_to_read);
}
- // Return number of logical records read
+ // Reads repeated records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ int64_t ReadRepeatedRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ const int64_t start_levels_position = levels_position_;
+ // For repeated fields, DelimitRecords will update both records_read and
+ // values_to_read.
+ int64_t records_read = DelimitRecords(num_records, values_to_read);
+ if (read_dense_for_nullable_) {
+ ReadValuesDense(*values_to_read);
+ // Any values with def_level < max_def_level is considered null.
+ *null_count = levels_position_ - start_levels_position - *values_to_read;
+ } else {
+ ReadSpacedForNullable(start_levels_position, values_to_read, null_count);
+ DCHECK_EQ(*null_count, levels_position_ - start_levels_position -
*values_to_read);
+ }
+ return records_read;
+ }
+
+ // Reads optional records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ int64_t ReadOptionalRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ const int64_t start_levels_position = levels_position_;
+ // No repetition levels, skip delimiting logic. Each level represents a
+ // null or not null entry
+ int64_t records_read =
+ std::min<int64_t>(levels_written_ - levels_position_, num_records);
+ // This is advanced by DelimitRecords, which we skipped
+ levels_position_ += records_read;
+
+ if (read_dense_for_nullable_) {
+ ReadDenseForOptional(start_levels_position, values_to_read, null_count);
+ } else {
+ ReadSpacedForNullable(start_levels_position, values_to_read, null_count);
+ }
+ return records_read;
+ }
+
+ // Reads optional records and returns number of records read. Fills in
+ // values_to_read and null_count.
+ // null_count will be always 0. Kept the argument to be consistent with the
+ // other functions for repeated/optional.
+ int64_t ReadRequiredRecords(int64_t num_records, int64_t* values_to_read,
+ int64_t* null_count) {
+ *values_to_read = num_records;
+ *null_count = 0;
+ ReadValuesDense(*values_to_read);
+ return num_records;
+ }
+
+ // Reads dense for optional records. First, it figures out number of values
to
+ // read and null count.
+ void ReadDenseForOptional(int64_t start_levels_position, int64_t*
values_to_read,
+ int64_t* null_count) {
+ // levels_position_ must already be incremented based on number of records
+ // read.
+ DCHECK_GE(levels_position_, start_levels_position);
+
+ // When reading dense we need to figure out number of values to read.
+ const int16_t* def_levels = this->def_levels();
+ for (int i = start_levels_position; i < levels_position_; ++i) {
+ if (def_levels[i] == this->max_def_level_) {
+ ++(*values_to_read);
+ } else {
+ ++(*null_count);
+ }
+ }
+ // Any values with def_level < max_def_level is considered null.
+ DCHECK_EQ(*null_count, levels_position_ - start_levels_position -
*values_to_read);
+ ReadValuesDense(*values_to_read);
+ }
+
+ // Reads spaced for optional or repeated fields.
+ void ReadSpacedForNullable(int64_t start_levels_position, int64_t*
values_to_read,
Review Comment:
nit: even though it is verbose, I think being more descriptive here (e.g.
ReadSpacedForNonRequired, or ReadSpacedForOptionalOrRepeated) might be clearer
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]